AlexsLemonade · jaclyn-taroni · Jul 19, 2021 · Jun 2, 2021 · Jun 2, 2021 · Jun 3, 2021
diff --git a/analyses/molecular-subtyping-LGAT/00-LGAT-select-pathology-dx.R b/analyses/molecular-subtyping-LGAT/00-LGAT-select-pathology-dx.R
@@ -32,9 +32,10 @@ exclude_path_dx <- stringr::str_to_lower(
     "Dysembryoplastic neuroepithelial tumor"
   ))
 
-# Exclusion criteria on the basis of pathology_free_text_diagnosis per 
-# https://github.com/AlexsLemonade/OpenPBTA-analysis/issues/995
-exclude_path_free_text <- stringr::str_to_lower(
+# Update: recode criteria on the basis of pathology_free_text_diagnosis.
+# We previously removed these per https://github.com/AlexsLemonade/OpenPBTA-analysis/issues/995,
+# but we now keep them and recode their subtypes as GNT.
+recode_path_free_text <- stringr::str_to_lower(
   c(
     "desmoplastic infantile astrocytoma",
     "glioneuronal"  # This also covers the more specific cases (e.g., rosette forming glioneuronal tumor)
@@ -43,7 +44,7 @@ exclude_path_free_text <- stringr::str_to_lower(
 # Create a list with the strings we'll use for inclusion/exclusion
 terms_list <- list(include_path_dx = include_path_dx,
                    exclude_path_dx = exclude_path_dx,
-                   exclude_path_free_text = exclude_path_free_text)
+                   recode_path_free_text = recode_path_free_text)
 
 # Write to file
 writeLines(jsonlite::prettify(jsonlite::toJSON(terms_list)), output_file)
diff --git a/analyses/molecular-subtyping-LGAT/01-subset-files-for-LGAT.Rmd b/analyses/molecular-subtyping-LGAT/01-subset-files-for-LGAT.Rmd
@@ -124,25 +124,6 @@ lgat_specimens_df <- clinical %>%
                 sample_type == "Tumor",
                 composition == "Solid Tissue")
 
-# We only exclude on the basis of the strings in pathology free text diagnosis
-# when pathology diagnosis indicates LGG because ganglioglioma tumors are 
-# glial-neuronal tumors that should be subtyped in this module
-# See: https://github.com/AlexsLemonade/OpenPBTA-analysis/issues/995#issuecomment-822744120
-lgat_specimens_df <- lgat_specimens_df %>%
-  dplyr::filter(
-    # If the pathology diagnosis is LGG, exclude samples with matching pathology free text
-    # diagnosis but retain samples if pathology free text is NA!
-    (pathology_diagnosis == "Low-grade glioma/astrocytoma (WHO grade I/II)" &
-       (str_detect(str_to_lower(pathology_free_text_diagnosis),
-                   paste(path_dx_list$exclude_path_free_text,
-                         collapse = "|"),
-                   negate = TRUE) |
-          is.na(pathology_free_text_diagnosis))) |
-      # Or if the pathology diagnosis is one of these three entries, include the samples
-      pathology_diagnosis %in% c("Ganglioglioma",
-                                 "Ganglioglioma;Low-grade glioma/astrocytoma (WHO grade I/II)",
-                                 "Low-grade glioma/astrocytoma (WHO grade I/II);Other"))
-
 # Write this intermediate file to the subset directory as it allows for
 # inspection
 write_tsv(lgat_specimens_df, file.path(subset_dir, "lgat_metadata.tsv"))

diff --git a/analyses/molecular-subtyping-LGAT/01-subset-files-for-LGAT.nb.html b/analyses/molecular-subtyping-LGAT/01-subset-files-for-LGAT.nb.html
diff --git a/analyses/molecular-subtyping-LGAT/03-subset-cnv-files-LGAT.nb.html b/analyses/molecular-subtyping-LGAT/03-subset-cnv-files-LGAT.nb.html
@@ -2354,7 +2354,7 @@ <h3>Gather CNV subsets</h3>
 <!-- rnb-source-end -->
 <div data-pagedtable="false">
 <script data-pagedtable-source type="application/json">
-{"columns":[{"label":["n"],"name":[1],"type":["dbl"],"align":["right"]}],"data":[{"1":"3.396226"}],"options":{"columns":{"min":{},"max":[10]},"rows":{"min":[10],"max":[10]},"pages":{}}}
+{"columns":[{"label":["n"],"name":[1],"type":["dbl"],"align":["right"]}],"data":[{"1":"3.345725"}],"options":{"columns":{"min":{},"max":[10]},"rows":{"min":[10],"max":[10]},"pages":{}}}
   </script>
 </div>
 <!-- rnb-source-begin eyJkYXRhIjoid3JpdGVfdHN2KExHQVRfY252X3N1YnNldCwgZmlsZS5wYXRoKHN1YnNldF9kaXIsIFwiTEdBVF9jbnZfc3Vic2V0LnRzdlwiKSkifQ== -->

diff --git a/analyses/molecular-subtyping-LGAT/04-LGAT-compile-subtypes.Rmd b/analyses/molecular-subtyping-LGAT/04-LGAT-compile-subtypes.Rmd
@@ -44,40 +44,53 @@ lgat_metadata_file <- file.path("lgat-subset", "lgat_metadata.tsv")
 bsid_to_include <- read_tsv(lgat_metadata_file) %>%
   pull(Kids_First_Biospecimen_ID)
 
+# File from 00-LGAT-select-pathology-dx containing the pathology diagnosis
+# strings that will be used to recode subtypes
+path_dx_list <- jsonlite::fromJSON(
+  file.path(subset_dir, 
+            "lgat_subtyping_path_dx_strings.json")
+)
+
 # clinical file
 lgat_manifest <- read_tsv(file.path(root_dir, 
                                     "data",
                                     "pbta-histologies-base.tsv"), 
                           guess_max = 10000) %>%
   filter(Kids_First_Biospecimen_ID %in% bsid_to_include) %>%
   # select columns for interest
-  select(Kids_First_Biospecimen_ID,
+  dplyr::select(Kids_First_Biospecimen_ID,
          Kids_First_Participant_ID,
          sample_id,
          experimental_strategy,
-         cancer_predispositions)
+         cancer_predispositions,
+         pathology_diagnosis,
+         pathology_free_text_diagnosis)
 
 lgat_dna <- lgat_manifest %>%
   filter(experimental_strategy != "RNA-Seq") %>%
-  select( -experimental_strategy, -cancer_predispositions)
+  dplyr::select( -experimental_strategy, -cancer_predispositions)
 
 lgat_rna <- lgat_manifest %>%
   filter(experimental_strategy == "RNA-Seq") %>%
-  select( -experimental_strategy, -cancer_predispositions)
+  dplyr::select( -experimental_strategy, -cancer_predispositions)
 
 all_lgat_matched <- full_join(lgat_dna, lgat_rna,
                          by = c("Kids_First_Participant_ID",
-                                "sample_id"),
+                                "sample_id",
+                                "pathology_free_text_diagnosis",
+                                "pathology_diagnosis"),
                          suffix = c("_DNA", "_RNA")) %>%
-  select(starts_with("Kids_First"), sample_id)
+  dplyr::select(starts_with("Kids_First"), sample_id,
+                pathology_free_text_diagnosis,
+                pathology_diagnosis)
 
 # SNV based annotation
 lgat_snv <- read_tsv(file.path(subset_dir,"LGAT_snv_subset.tsv")) 
 
 # CNV based annotation
 lgat_cnv <- read_tsv(file.path(subset_dir,"LGAT_cnv_subset.tsv")) %>%
   # To-do add to CNV subset script
-  rename(Kids_First_Biospecimen_ID=lgat_wgs_biospecimen_ids)
+  dplyr::rename(Kids_First_Biospecimen_ID=lgat_wgs_biospecimen_ids)
 
 # Fusion base annotation
 lgat_fus <- read_tsv(file.path(subset_dir,"LGAT_fusion_subset.tsv")) 
@@ -91,7 +104,7 @@ This subtype is characterized by germline NF1 variants. We do not have germline
 nf1_snv <- lgat_snv %>%
   filter(NF1_mut=="Yes") %>%
   mutate(molecular_subtype = "NF1-somatic") %>%
-  select("Kids_First_Biospecimen_ID","molecular_subtype") 
+  dplyr::select("Kids_First_Biospecimen_ID","molecular_subtype") 
 
 nf1_germline <- lgat_manifest %>% 
   # to keep non-RNA files only since we will match the RNA 
@@ -101,7 +114,7 @@ nf1_germline <- lgat_manifest %>%
   filter(cancer_predispositions %in% c("NF-1",
                                        "NF-1,Other inherited conditions NOS")) %>%
   mutate(molecular_subtype = "NF1-germline") %>%
-  select("Kids_First_Biospecimen_ID","molecular_subtype") 
+  dplyr::select("Kids_First_Biospecimen_ID","molecular_subtype") 
 
 nf1_biospecimen <- bind_rows(nf1_snv,
                              nf1_germline) %>%
@@ -120,7 +133,7 @@ This subtype contains KIAA1549-BRAF fusion
 kiaabraf_biospecimen <- lgat_fus %>%
   filter(KIAA_BRAF_fus=="Yes") %>%
   mutate(molecular_subtype = "KIAA1549-BRAF") %>%
-  select("Kids_First_Biospecimen_ID","molecular_subtype") 
+  dplyr::select("Kids_First_Biospecimen_ID","molecular_subtype") 
 
 ```
 
@@ -132,7 +145,7 @@ This subtype contains BRAF V600E or V599 SNV or non-canonical BRAF SNV/indel ove
 brafV600_biospecimen <- lgat_snv %>%
   filter(BRAF_V600E_mut=="Yes") %>%
   mutate(molecular_subtype = "BRAF V600E") %>%
-  select("Kids_First_Biospecimen_ID","molecular_subtype") 
+  dplyr::select("Kids_First_Biospecimen_ID","molecular_subtype") 
 
 ```
 
@@ -146,12 +159,12 @@ OR contains KRAS, NRAS, HRAS, MAP2K1, MAP2K2, MAP2K1, ARAF SNV or indel
 otherMAPK_fus_biospecimen <- lgat_fus %>%
   filter(MAPK_fus=="Yes") %>%
   mutate(molecular_subtype = "other MAPK") %>%
-  select("Kids_First_Biospecimen_ID","molecular_subtype")
+  dplyr::select("Kids_First_Biospecimen_ID","molecular_subtype")
 
 otherMAPK_snv_biospecimen <- lgat_snv %>%
   filter(MAPK_mut=="Yes") %>%
   mutate(molecular_subtype = "other MAPK") %>%
-  select("Kids_First_Biospecimen_ID","molecular_subtype") 
+  dplyr::select("Kids_First_Biospecimen_ID","molecular_subtype") 
 
 
 ```
@@ -167,12 +180,12 @@ OR harbors a PDGFRA SNV or fusion
 rtk_fus_biospecimen <- lgat_fus %>%
   filter(RTK_fus=="Yes") %>%
   mutate(molecular_subtype = "RTK") %>%
-  select("Kids_First_Biospecimen_ID","molecular_subtype") 
+  dplyr::select("Kids_First_Biospecimen_ID","molecular_subtype") 
 
 rtk_snv_biospecimen <- lgat_snv %>%
   filter(RTK_mut=="Yes") %>%
   mutate(molecular_subtype = "RTK") %>%
-  select("Kids_First_Biospecimen_ID","molecular_subtype") 
+  dplyr::select("Kids_First_Biospecimen_ID","molecular_subtype") 
 
 
 
@@ -188,17 +201,17 @@ OR harbors FGFR1 or FGFR2 fusions
 fgfr_fus_biospecimen <- lgat_fus %>%
   filter(FGFR_fus=="Yes") %>%
   mutate(molecular_subtype = "FGFR") %>%
-  select("Kids_First_Biospecimen_ID","molecular_subtype") 
+  dplyr::select("Kids_First_Biospecimen_ID","molecular_subtype") 
 
 fgfr_snv_biospecimen <- lgat_snv %>%
   filter(FGFR_mut=="Yes") %>%
   mutate(molecular_subtype = "FGFR") %>%
-  select("Kids_First_Biospecimen_ID","molecular_subtype") 
+  dplyr::select("Kids_First_Biospecimen_ID","molecular_subtype") 
 
 fgfr_cnv_biospecimen <- lgat_cnv %>%
   filter(FGFR_DUP_TANDEM=="Yes" |  FGFR_DUP == "Yes") %>%
   mutate(molecular_subtype = "FGFR") %>%
-  select("Kids_First_Biospecimen_ID","molecular_subtype") 
+  dplyr::select("Kids_First_Biospecimen_ID","molecular_subtype") 
 
 ```
 
@@ -209,7 +222,7 @@ This subtype harbors an IDH R132 mutation
 idh_biospecimen <- lgat_snv %>%
   filter(IDH_mut=="Yes") %>%
   mutate(molecular_subtype = "IDH") %>%
-  select("Kids_First_Biospecimen_ID","molecular_subtype") 
+  dplyr::select("Kids_First_Biospecimen_ID","molecular_subtype") 
 
 ```
 
@@ -220,7 +233,7 @@ This subtype harbors an H3F3A K28M or G35R/V mutation
 h3_biospecimen <- lgat_snv %>%
   filter(H3.1_mut =="Yes"| H3.2_mut =="Yes" |H3.3_mut =="Yes") %>%
   mutate(molecular_subtype = "H3") %>%
-  select("Kids_First_Biospecimen_ID","molecular_subtype") 
+  dplyr::select("Kids_First_Biospecimen_ID","molecular_subtype") 
 
 ```
 
@@ -231,7 +244,7 @@ This subtype harbors either a MYB-QKI fusion or other MYB or MYBL1 fusion
 myb_biospecimen <- lgat_fus %>%
   filter(MYB_fus=="Yes") %>%
   mutate(molecular_subtype = "MYB/MYBL1") %>%
-  select("Kids_First_Biospecimen_ID","molecular_subtype")
+  dplyr::select("Kids_First_Biospecimen_ID","molecular_subtype")
 ```
 
 ### LGG, CDKN2A/B
@@ -243,7 +256,7 @@ This is a secondary co-occurring alteration with prognostic significance.
 cdkn_biospecimen <- lgat_cnv %>%
   filter(CDKN2A_DEL=="Yes"| CDKN2B_DEL == "Yes") %>%
   mutate(molecular_subtype = "CDKN2A/B") %>%
-  select("Kids_First_Biospecimen_ID","molecular_subtype") 
+  dplyr::select("Kids_First_Biospecimen_ID","molecular_subtype") 
 ```
 
 ### Compile
@@ -259,7 +272,7 @@ all_dna_subtype <- bind_rows(nf1_biospecimen,
                          h3_biospecimen,
                          cdkn_biospecimen
 ) %>%
-  rename(Kids_First_Biospecimen_ID_DNA=Kids_First_Biospecimen_ID) %>%
+  dplyr::rename(Kids_First_Biospecimen_ID_DNA=Kids_First_Biospecimen_ID) %>%
   # add Kids_First_Biospecimen_ID_RNA to dna subtypes dataframe
   left_join(all_lgat_matched,by="Kids_First_Biospecimen_ID_DNA") 
 
@@ -268,7 +281,7 @@ all_rna_subtype <- bind_rows(kiaabraf_biospecimen,
                              rtk_fus_biospecimen,
                              fgfr_fus_biospecimen,
                              myb_biospecimen) %>%
-  rename(Kids_First_Biospecimen_ID_RNA=Kids_First_Biospecimen_ID) %>%
+  dplyr::rename(Kids_First_Biospecimen_ID_RNA=Kids_First_Biospecimen_ID) %>%
   # add Kids_First_Biospecimen_ID_DNA to rna subtypes dataframe
   left_join(all_lgat_matched,by="Kids_First_Biospecimen_ID_RNA")
 
@@ -277,11 +290,18 @@ all_subtype <- bind_rows(all_rna_subtype,
                          all_dna_subtype) %>%
   full_join(all_lgat_matched, by=c("Kids_First_Biospecimen_ID_RNA","Kids_First_Biospecimen_ID_DNA",
                                    "sample_id",
+                                   "pathology_free_text_diagnosis",
+                                   "pathology_diagnosis",
                                    "Kids_First_Participant_ID")) %>%
   # lets group because there are some instances where LGAT DNA/RNA have multiple subtypes
-  group_by(sample_id,Kids_First_Participant_ID,Kids_First_Biospecimen_ID_RNA,Kids_First_Biospecimen_ID_DNA) %>%
+  group_by(sample_id,Kids_First_Participant_ID,
+           Kids_First_Biospecimen_ID_RNA,
+           Kids_First_Biospecimen_ID_DNA,
+           pathology_free_text_diagnosis,
+           pathology_diagnosis) %>%
   # merging them here with summarise
   summarise(molecular_subtype= toString(molecular_subtype)) %>%
+  ungroup() %>% 
   # formatting molecular_subtype columns
   dplyr::mutate(molecular_subtype = case_when(
     # molecular_subtype is NA and we didn't find any canonical
@@ -296,6 +316,14 @@ all_subtype <- bind_rows(all_rna_subtype,
                                   is.na(Kids_First_Biospecimen_ID_RNA))  ~ "LGG, To be classified",
     # when molecular_subtype is not NA 
     TRUE ~ stringr::str_c("LGG, ", molecular_subtype))) %>%
+  # recode glioneuronal tumors LGG -> GNT; lowercase the free text since the recode terms are lowercased
+  dplyr::mutate(molecular_subtype = case_when(str_detect(str_to_lower(pathology_free_text_diagnosis),
+                                                       paste(path_dx_list$recode_path_free_text,collapse = "|")) &
+                                                pathology_diagnosis == "Low-grade glioma/astrocytoma (WHO grade I/II)" ~ 
+                                               gsub("LGG,","GNT,",molecular_subtype),
+                                             TRUE ~ molecular_subtype)) %>%
+  dplyr::select(-pathology_free_text_diagnosis,
+                -pathology_diagnosis) %>%
   unique() %>%
   dplyr::arrange(sample_id) %>%
   write_tsv(file.path(resultDir,"lgat_subtyping.tsv"))