Skip to content
This repository has been archived by the owner on Jun 21, 2023. It is now read-only.

V15 Fix molecular-subtype-chordoma analysis #590

Merged
merged 15 commits into from
Mar 2, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ jobs:

- run:
name: Molecular subtyping Chordoma
command: ./scripts/run_in_ci.sh Rscript -e "rmarkdown::render('analyses/molecular-subtyping-chordoma/01-Subtype-chordoma.Rmd', clean = TRUE)"
command: OPENPBTA_SUBSET=0 ./scripts/run_in_ci.sh bash analyses/molecular-subtyping-chordoma/run-molecular-subtyping-chordoma.sh

- run:
name: Molecular subtyping - Ependymoma
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# This script subsets the focal copy number, RNA expression, histologies`
# files to include only Chordoma samples.

# Written originally Chante Bethell 2019
# (Adapted for this module by Candace Savonen 2020)
#
# #### USAGE
# This script is intended to be run via the command line from the top directory
# of the repository as follows:
#
# Rscript 'analyses/molecular-subtyping-chordoma/00-subset-files-for-chordoma.R'


#### Set Up --------------------------------------------------------------------

# Get `magrittr` pipe
`%>%` <- dplyr::`%>%`

#### Directories and Files -----------------------------------------------------

# Detect the ".git" folder -- this will in the project root directory.
# Use this as the root directory to ensure proper sourcing of functions no
# matter where this is called from
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))

# Set path to results and plots directory
results_dir <-
file.path(root_dir, "analyses", "molecular-subtyping-chordoma", "chordoma-subset")

if (!dir.exists(results_dir)) {
dir.create(results_dir)
}

# Read in metadata
metadata <-
readr::read_tsv(file.path(root_dir, "data", "pbta-histologies.tsv"))

#### Filter metadata -----------------------------------------------------------
# Select wanted columns in metadata for merging and assign to a new object
chordoma_metadata <- metadata %>%
dplyr::select(
sample_id,
Kids_First_Participant_ID,
Kids_First_Biospecimen_ID
) %>%
dplyr::filter(short_histology == "Chordoma")

#### Filter expression data ----------------------------------------------------
# Read in the stranded expression data file
expression_data <- read_rds(file.path(
"data",
"pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds"
))

# Filter to Chordoma samples only -- we can use chordoma_df because it is subset to
# RNA-seq samples
expression_data <- expression_data %>%
dplyr::select(intersect(
chordoma_metadata$Kids_First_Biospecimen_ID,
colnames(expression_data)
)) %>%
readr::write_rds(file.path(
results_dir,
"chordoma-only-gene-expression-rsem-fpkm-collapsed.stranded.rds"
))

#### Filter focal CN data ------------------------------------------------------

# Filter focal CN to Chordoma samples only
cn_metadata <- data.table::fread(file.path(
"data",
"consensus_seg_annotated_cn_autosomes.tsv.gz"
)) %>%
dplyr::left_join(chordoma_metadata,
by = c("biospecimen_id" = "Kids_First_Biospecimen_ID")
) %>%
dplyr::select(
gene_symbol,
sample_id,
Kids_First_Participant_ID,
biospecimen_id,
status
) %>%
dplyr::filter(sample_id %in% chordoma_metadata$sample_id) %>%
# Write to file
readr::write_tsv(file.path(results_dir, "chordoma-only_cn_autosomes.tsv.gz"))
195 changes: 102 additions & 93 deletions analyses/molecular-subtyping-chordoma/01-Subtype-chordoma.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -18,32 +18,42 @@ library(ggplot2)
### Read in data

```{r}
data_dir <- file.path("..", "..", "data")
histologies_df <- read_tsv(file.path(data_dir,
"pbta-histologies.tsv"),
col_types = cols(
molecular_subtype = col_character()
))
# File path to subsetted files directory
input_dir <- "chordoma-subset"
```

Set up metadata

```{r}
# TODO: update to use consensus file with annotations included in the data
# download?
focal_cn_df <-
read_tsv(file.path("..", "focal-cn-file-preparation", "results",
"consensus_seg_annotated_cn_autosomes.tsv.gz"))
# Read in non-subsetted metadata
histologies_df <- readr::read_tsv(file.path("..", "..", "data", "pbta-histologies.tsv"))

# Subset metadata
subset_metadata <- histologies_df %>%
dplyr::filter(short_histology == "Chordoma") %>%
select(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're calling the filter function above using dplyr:: but not doing the same for select here. Is this because filter was behaving similarly to rename as you noted in your original comment? If so, then disregard this comment. If not, removing the dplyr:: altogether (except for in the case of the rename function) or adding the dplyr:: to the rest of the functions would be nice for consistency purposes.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

filter also has a base:: function that can mess with stuff sometimes.

Kids_First_Biospecimen_ID, sample_id, Kids_First_Participant_ID,
experimental_strategy
)
```

```{r}
# we need to include the sample_id field from pbta-histologies.tsv in the final
# table (field will allow #us to map between RNA-seq (e.g., SMARCB1 expression
# values) and WGS data (e.g., SMARCB1 focal copy number status) from the same
# event for a given individual). To get the SMARCB1 jitter plot in the photo
# here #250 (comment), you will first need to read in the collapsed expression
subset_focal_cn_df <-
data.table::fread(file.path(input_dir, "chordoma-only_cn_autosomes.tsv.gz"))
```

```{r}
# we need to include the sample_id field from pbta-histologies.tsv in the final
# table (field will allow #us to map between RNA-seq (e.g., SMARCB1 expression
# values) and WGS data (e.g., SMARCB1 focal copy number status) from the same
# event for a given individual). To get the SMARCB1 jitter plot in the photo
# here #250 (comment), you will first need to read in the collapsed expression
# data
expression_data <-
read_rds(file.path(data_dir,
"pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds"))
subset_expression_data <-
read_rds(file.path(
input_dir,
"chordoma-only-gene-expression-rsem-fpkm-collapsed.stranded.rds"
))
```

### Output
Expand All @@ -66,59 +76,38 @@ output_file <- file.path(results_dir, "chordoma_smarcb1_status.tsv")

## Prepare the data

Extracting the biospecimen IDs that correspond to chordoma samples
Extracting the chordoma samples that have a loss of SMARCB1 from `subset_focal_cn_df`

```{r}
chordoma_samples <- histologies_df %>%
filter(short_histology == "Chordoma") %>%
pull(Kids_First_Biospecimen_ID)
```

Extracting the chordoma samples that have a loss of SMARCB1 from `focal_cn_df`

```{r}
chordoma_loss <- focal_cn_df %>%
filter(biospecimen_id %in% chordoma_samples,
gene_symbol == "SMARCB1",
status == "loss")
chordoma_loss
```

```{r}
# remove large copy number data frame
rm(focal_cn_df)
```

Modifying the output file to filter only chorodoma diagnoses and removing columns with information not relevant to the study
```{r}
chordoma_id_df <- histologies_df %>%
# only rows with chordoma samples
filter(short_histology == "Chordoma") %>%
# select only these columns that we'll need later
select(Kids_First_Biospecimen_ID, sample_id, Kids_First_Participant_ID,
experimental_strategy)
chordoma_id_df
chordoma_loss <- subset_focal_cn_df %>%
filter(
gene_symbol == "SMARCB1",
status == "loss"
)
chordoma_loss
```

Distinguishing the chordoma samples with no copy number change chromosome 22
```{r}
copy_neutral_df <- chordoma_id_df %>%
copy_neutral_df <- subset_metadata %>%
# the copy events can only be taken from WGS data not RNA-seq data
# we also only want biospecimens where a loss was not recorded to avoid duplicates
filter(experimental_strategy == "WGS",
!(Kids_First_Biospecimen_ID %in% chordoma_loss$biospecimen_id)) %>%
filter(
experimental_strategy == "WGS",
!(Kids_First_Biospecimen_ID %in% chordoma_loss$biospecimen_id)
) %>%
# if there's no loss, let's assume status is copy neutral
mutate(status = "neutral") %>%
# let's get the columns to match chordoma_loss
rename(biospecimen_id = Kids_First_Biospecimen_ID) %>%
dplyr::rename(biospecimen_id = Kids_First_Biospecimen_ID) %>%
select(biospecimen_id, status)
copy_neutral_df
```

Joining chordoma samples (both with chr 22 loss and no copy number change)
```{r}
chordoma_copy <- chordoma_loss %>%
#join the losses with the neutrals to get a new data frame
chordoma_copy <- chordoma_loss %>%
# join the losses with the neutrals to get a new data frame
select(biospecimen_id, status) %>%
bind_rows(copy_neutral_df)
chordoma_copy
Expand All @@ -129,10 +118,13 @@ Need to get the sample_id that corresponds to biospecimen_id into chordoma_copy
chordoma_copy <- chordoma_copy %>%
# get only the Kids_First_Biospecimen_ID, sample_id columns from our identifier data.frame
# then use biospecimen IDs to add the sample_id info
inner_join(select(chordoma_id_df,
Kids_First_Biospecimen_ID,
sample_id),
by = c("biospecimen_id" = "Kids_First_Biospecimen_ID"))
inner_join(select(
subset_metadata,
Kids_First_Biospecimen_ID,
sample_id
),
by = c("biospecimen_id" = "Kids_First_Biospecimen_ID")
)
chordoma_copy
```

Expand All @@ -141,16 +133,15 @@ Look at SMARCB1 expression values only in chordoma
```{r}
# get the row that contains the SMARCB1 values
# gene symbols are rownames
smarcb1_expression <- expression_data[which(rownames(expression_data) == "SMARCB1"), ]

# now only the columns correspond to chordoma samples
smarcb1_expression <- smarcb1_expression[, which(colnames(expression_data) %in% chordoma_samples) ]
smarcb1_expression
smarcb1_expression <- subset_expression_data[which(rownames(subset_expression_data) == "SMARCB1"), ]
```

```{r}
# remove large expression matrix that's no longer needed
rm(expression_data)
# now only the columns correspond to chordoma samples
smarcb1_expression <- smarcb1_expression[, which(colnames(subset_expression_data) %in% subset_metadata$Kids_First_Biospecimen_ID) ]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line looks weird (format-wise), did you run this through a styler?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did but it doesn't do anything with really long lines. I'd rather use dplyr::filter here anyway but was trying to keep the original code here.


# Print out the expression for SMARCB1
smarcb1_expression
```

The `smarcb1_expression` is a not a friendly form ^^; Transposing needed:
Expand All @@ -163,17 +154,20 @@ smarcb1_expression <- t(smarcb1_expression) %>%
# we want the rownames that are biospecimen identifers as their own column called Kids_First_Biospecimen_ID
tibble::rownames_to_column("Kids_First_Biospecimen_ID") %>%
# give SMARCB1 column a slightly better column name
rename(SMARCB1_expression = SMARCB1)
dplyr::rename(SMARCB1_expression = SMARCB1)
smarcb1_expression
```

This also needs sample_id to add it in
```{r}
smarcb1_expression <- smarcb1_expression %>%
inner_join(select(chordoma_id_df,
Kids_First_Biospecimen_ID,
sample_id),
by = "Kids_First_Biospecimen_ID")
inner_join(select(
subset_metadata,
Kids_First_Biospecimen_ID,
sample_id
),
by = "Kids_First_Biospecimen_ID"
)
smarcb1_expression
```

Expand All @@ -182,29 +176,34 @@ Joining the copy number data with the expression data in this step
chordoma_smarcb1_df <- smarcb1_expression %>%
# any missing samples will get filled with NA when using a full join
full_join(chordoma_copy, by = "sample_id") %>%
rename(Kids_First_Biospecimen_ID_DNA = Kids_First_Biospecimen_ID,
Kids_First_Biospecimen_ID_RNA = biospecimen_id)
dplyr::rename(
Kids_First_Biospecimen_ID_DNA = Kids_First_Biospecimen_ID,
Kids_First_Biospecimen_ID_RNA = biospecimen_id
)

# this step adds in the participant identifier (sample_id to match between the two data.frame)
chordoma_smarcb1_df <- chordoma_id_df %>%
chordoma_smarcb1_df <- subset_metadata %>%
select(sample_id, Kids_First_Participant_ID) %>%
distinct() %>%
inner_join(chordoma_smarcb1_df,
by = "sample_id")
by = "sample_id"
)

chordoma_smarcb1_df
```

```{r}
chordoma_smarcb1_df <- chordoma_smarcb1_df %>%
select(Kids_First_Participant_ID,
Kids_First_Biospecimen_ID_DNA,
Kids_First_Biospecimen_ID_RNA,
sample_id,
status,
SMARCB1_expression) %>%
select(
Kids_First_Participant_ID,
Kids_First_Biospecimen_ID_DNA,
Kids_First_Biospecimen_ID_RNA,
sample_id,
status,
SMARCB1_expression
) %>%
# 'status' is replaced a more descriptive name
rename(focal_SMARCB1_status = status)
dplyr::rename(focal_SMARCB1_status = status)
chordoma_smarcb1_df
```

Expand All @@ -218,20 +217,24 @@ chordoma_smarcb1_df %>%
# drop the sample that doesn't have WGS data
tidyr::drop_na() %>%
# this step specifies what should go on the x- and y-axes
ggplot(aes(x = focal_SMARCB1_status,
y = SMARCB1_expression)) +
# we want a jitter plot where the points aren't too far
ggplot(aes(
x = focal_SMARCB1_status,
y = SMARCB1_expression
)) +
# we want a jitter plot where the points aren't too far
# apart that's what width does
geom_jitter(width = 0.1) +
# this is plotting the median as a blue diamond
stat_summary(fun.y = "median",
geom = "point",
size = 3,
color = "blue",
shape = 18) +
xlab("SMARCB1 status") +
xlab("SMARCB1 status") +
ylab("SMARCB1 expression")
stat_summary(
fun.y = "median",
geom = "point",
size = 3,
color = "blue",
shape = 18
) +
xlab("SMARCB1 status") +
xlab("SMARCB1 status") +
ylab("SMARCB1 expression")
```

```{r}
Expand All @@ -244,3 +247,9 @@ Write the table to file.
chordoma_smarcb1_df %>%
write_tsv(output_file)
```

### Session Info

```{r}
sessionInfo()
```
Loading