Skip to content
This repository has been archived by the owner on Jun 21, 2023. It is now read-only.

Commit

Permalink
update Sample distribution analysis for v15 (#584)
Browse files Browse the repository at this point in the history
* Change disease_type_new to integrated_diagnosis

* Output updates for v15

* Comment out all analyses from CI but this one

* Revert "Comment out all analyses from CI but this one"

This reverts commit 352ca08.

Co-authored-by: Candace Savonen <cansav09@gmail.com>
  • Loading branch information
jashapiro and cansavvy authored Mar 2, 2020
1 parent 35813d9 commit d5617a0
Show file tree
Hide file tree
Showing 10 changed files with 849 additions and 831 deletions.
22 changes: 11 additions & 11 deletions analyses/sample-distribution-analysis/01-filter-across-types.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ location_fn <- function(location) {
disease_type_vector <- brain_location %>%
dplyr::arrange(dplyr::desc(n)) %>%
dplyr::filter(stringr::str_detect(primary_site, location)) %>%
dplyr::pull(disease_type_new)
dplyr::pull(integrated_diagnosis)
unique(disease_type_vector)
}

Expand All @@ -55,7 +55,7 @@ if (!dir.exists(plots_dir)) {
histologies_df <-
readr::read_tsv(file.path(root_dir, "data", "pbta-histologies.tsv")) %>%
as.data.frame() %>%
dplyr::filter(!is.na(disease_type_new))
dplyr::filter(!is.na(integrated_diagnosis))

# Filter the histologies file to account for multiple samples from the same
# individual and the fact that multiple experimental strategies are in this
Expand All @@ -68,9 +68,9 @@ histologies_df <- histologies_df %>%

# data.frame with the count of each unique cancer type expression
disease_expression <- histologies_df %>%
# some recurrences can have different disease_type_new values
dplyr::distinct(Kids_First_Participant_ID, disease_type_new) %>%
dplyr::group_by(disease_type_new) %>%
# some recurrences can have different integrated_diagnosis values
dplyr::distinct(Kids_First_Participant_ID, integrated_diagnosis) %>%
dplyr::group_by(integrated_diagnosis) %>%
dplyr::count(name = "count") %>%
dplyr::arrange(dplyr::desc(count))

Expand All @@ -83,8 +83,8 @@ disease_expression <- disease_expression %>%
dplyr::mutate(percent = paste0((round(count / sum_count, 4) * 100), "%"))

# Reorder the columns to be displayed in descending order by count on the plot
disease_expression$disease_type_new <- with(disease_expression,
reorder(disease_type_new, -count))
disease_expression$integrated_diagnosis <- with(disease_expression,
reorder(integrated_diagnosis, -count))

# Write to tsv file
readr::write_tsv(disease_expression,
Expand All @@ -93,7 +93,7 @@ readr::write_tsv(disease_expression,

# Create a bar plot of sample distribution across cancer types
gg_types <- disease_expression %>%
ggplot2::ggplot(ggplot2::aes(x = disease_type_new, y = count, fill = count)) +
ggplot2::ggplot(ggplot2::aes(x = integrated_diagnosis, y = count, fill = count)) +
ggplot2::geom_col() +
ggplot2::theme_bw() +
ggplot2::labs(x = "Cancer Types", y = "Count",
Expand Down Expand Up @@ -122,10 +122,10 @@ ggplot2::ggsave(
# data.frame with the location where each cancer type in the dataset is
# expressed, sorted to show highest expression
brain_location <- histologies_df %>%
dplyr::distinct(Kids_First_Participant_ID, disease_type_new,
dplyr::distinct(Kids_First_Participant_ID, integrated_diagnosis,
primary_site) %>%
dplyr::select(disease_type_new, primary_site) %>%
dplyr::group_by(disease_type_new, primary_site) %>%
dplyr::select(integrated_diagnosis, primary_site) %>%
dplyr::group_by(integrated_diagnosis, primary_site) %>%
dplyr::tally() %>%
dplyr::arrange(dplyr::desc(n))

Expand Down
8 changes: 4 additions & 4 deletions analyses/sample-distribution-analysis/02-multilayer-plots.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,21 +37,21 @@ final_df <- histologies_df %>%
dplyr::filter(sample_type == "Tumor",
composition == "Solid Tissue") %>%
dplyr::distinct(Kids_First_Participant_ID, broad_histology,
short_histology, disease_type_new) %>%
short_histology, integrated_diagnosis) %>%
# Select our 3 columns of interest
dplyr::select(broad_histology, short_histology, disease_type_new) %>%
dplyr::select(broad_histology, short_histology, integrated_diagnosis) %>%
# Remove any row that has an NA
dplyr::filter(complete.cases(.)) %>%
# Group by all 3 columns in order to count
dplyr::group_by(broad_histology, short_histology, disease_type_new) %>%
dplyr::group_by(broad_histology, short_histology, integrated_diagnosis) %>%
# Add the count to a column named size
dplyr::add_count(name = "size") %>%
# Place the value 1 in a column named counter for treemap and sunburst plots
dplyr::mutate(counter= c(1)) %>%
# Change the column names
dplyr::rename(level1 = broad_histology,
level2 = short_histology,
level3 = disease_type_new) %>%
level3 = integrated_diagnosis) %>%
# Reorder the rows according to the 3 levels
dplyr::arrange(level1, level2, level3) %>%
# tbl_df -> data.frame
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ For `PT_HT4HJXY6`, there is WGS data for both the primary CNS tumor and the seco

## By histology

We're going to use the `disease_type_new` column here.
We're going to use the `integrated_diagnosis` column here.

### Primary only

Expand All @@ -215,8 +215,8 @@ This table is different than what is plotted upstream in this module because we
```{r}
tumor_df %>%
filter(tumor_descriptor == "Initial CNS Tumor") %>%
distinct(Kids_First_Participant_ID, disease_type_new) %>%
group_by(disease_type_new) %>%
distinct(Kids_First_Participant_ID, integrated_diagnosis) %>%
group_by(integrated_diagnosis) %>%
tally() %>%
arrange(desc(n)) %>%
regulartable() %>%
Expand All @@ -228,7 +228,7 @@ tumor_df %>%
```{r}
disease_types_descriptors <- tumor_df %>%
group_by(Kids_First_Participant_ID) %>%
summarize(disease_types = paste(sort(unique(disease_type_new)),
summarize(disease_types = paste(sort(unique(integrated_diagnosis)),
collapse = ", "),
descriptors = paste(sort(unique(tumor_descriptor)),
collapse = ", ")) %>%
Expand Down

Large diffs are not rendered by default.

Binary file not shown.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,70 +1,63 @@
disease_type_new count percent
Low-grade glioma;astrocytoma (WHO grade I/II) 239 24.92%
Medulloblastoma 119 12.41%
High-grade glioma;astrocytoma (WHO grade III/IV) 101 10.53%
Ependymoma 86 8.97%
Ganglioglioma 46 4.8%
Brainstem glioma- Diffuse intrinsic pontine glioma 45 4.69%
Craniopharyngioma 39 4.07%
Atypical Teratoid Rhabdoid Tumor 28 2.92%
Meningioma 27 2.82%
Dysembryoplastic neuroepithelial tumor 25 2.61%
Neurofibroma;Plexiform 19 1.98%
Choroid plexus papilloma 16 1.67%
Schwannoma 16 1.67%
Supratentorial or Spinal Cord PNET 16 1.67%
Dysplasia;Gliosis 14 1.46%
Teratoma 8 0.83%
Ewings Sarcoma 7 0.73%
Metastatic secondary tumors 5 0.52%
Adenoma 4 0.42%
Choroid plexus carcinoma 4 0.42%
Cyst 4 0.42%
Germinoma 4 0.42%
Glial-neuronal tumor NOS 4 0.42%
Langerhans Cell histiocytosis 4 0.42%
Malignant peripheral nerve sheath tumor 4 0.42%
Neuroblastoma 4 0.42%
Pineoblastoma 4 0.42%
Sarcoma 4 0.42%
integrated_diagnosis count percent
Low-grade astrocytic tumor 239 24.59%
Medulloblastoma 118 12.14%
High-grade glioma 90 9.26%
Ependymoma 84 8.64%
Diffuse midline glioma 56 5.76%
Ganglioglioma 45 4.63%
Craniopharyngioma 39 4.01%
Atypical Teratoid Rhabdoid Tumor 28 2.88%
Dysembryoplastic neuroepithelial tumor 27 2.78%
Meningioma 27 2.78%
CNS Embryonal Tumor 20 2.06%
Neurofibroma 19 1.95%
Choroid plexus papilloma 16 1.65%
Schwannoma 16 1.65%
Brainstem glioma- Diffuse intrinsic pontine glioma 14 1.44%
Dysplasia 14 1.44%
Teratoma 8 0.82%
Ewings Sarcoma 7 0.72%
Embryonal tumor with multilayer rosettes 6 0.62%
Germinoma 5 0.51%
Metastatic secondary tumors 5 0.51%
Adenoma 4 0.41%
Choroid plexus carcinoma 4 0.41%
Cyst 4 0.41%
Glial-neuronal tumor NOS 4 0.41%
Langerhans Cell histiocytosis 4 0.41%
Malignant peripheral nerve sheath tumor 4 0.41%
Neuroblastoma 4 0.41%
Sarcoma 4 0.41%
Chordoma 3 0.31%
Dermoid Cyst 3 0.31%
Meningioangiomatosis 3 0.31%
Neurocytoma 3 0.31%
Subependymal Giant Cell Astrocytoma (SEGA) 3 0.31%
Pineoblastoma 3 0.31%
Subependymal Giant Cell Astrocytoma 3 0.31%
Cavernoma 2 0.21%
Ganglioneuroblastoma 2 0.21%
Gliomatosis Cerebri 2 0.21%
Hemangioblastoma 2 0.21%
Non-Langerhans Histiocytosis;JXG 2 0.21%
JXG- Non-Langerhans Histiocytosis 2 0.21%
Low-grade mixed tumor 2 0.21%
Oligodendroglioma 2 0.21%
Osteoblastoma 2 0.21%
Rhabdomyosarcoma 2 0.21%
Brain arteriovenous malformation 1 0.1%
Chondrosarcoma 1 0.1%
Choroid plexus cyst 1 0.1%
CNS embryonal tumor 1 0.1%
CNS Neuroblastoma 1 0.1%
Cortical Tubers 1 0.1%
Dysembryoplastic neuroepithelial tumor (DNET);Dysplasia/Gliosis;Ganglioglioma 1 0.1%
Dysembryoplastic neuroepithelial tumor (DNET);Ganglioglioma 1 0.1%
Dysplasia;Gliosis;Glial-neuronal tumor NOS 1 0.1%
Embryonal tumor with multilayer rosettes NOS 1 0.1%
Ependymoblastoma 1 0.1%
Fibroma 1 0.1%
Fibromyxoid Tumor 1 0.1%
Ganglioglioma;Low-grade glioma/astrocytoma (WHO grade I/II) 1 0.1%
Ganglioneuroma 1 0.1%
Germinoma;Teratoma 1 0.1%
Hamartoma 1 0.1%
Intraneural perineurioma 1 0.1%
Malignant melanocytic neoplasm 1 0.1%
Medulloepithelioma 1 0.1%
Metastatic secondary tumors;Neuroblastoma 1 0.1%
Myeloid Sarcoma 1 0.1%
Myofibroblastoma 1 0.1%
Myxoid spindle cell tumor 1 0.1%
NeuroInflammatory systemic disease 1 0.1%
Non-germinomatous germ cell tumor;Teratoma 1 0.1%
Non-germinomatous germ cell tumor- Teratoma 1 0.1%
Ossifying Fibroma 1 0.1%
Papillary glioneuronal tumor 1 0.1%
Primary CNS lymphoma 1 0.1%
Expand Down
Loading

0 comments on commit d5617a0

Please sign in to comment.