Update tp53 score for basesubtyping (#1037)

* update run script for subtyping * revamped for subtyping Co-authored-by: Jo Lynne Rokita <jolynnerokita@d3b.center>
AlexsLemonade · Apr 27, 2021 · f125d2f · f125d2f
1 parent 35384c7
commit f125d2f
Show file tree

Hide file tree

Showing 11 changed files with 210 additions and 168 deletions.
diff --git a/analyses/tp53_nf1_score/01-apply-classifier.py b/analyses/tp53_nf1_score/01-apply-classifier.py
@@ -60,11 +60,10 @@
 rownamesRDS = robjects.r["rownames"]
 
 # Load gene expression data in rds format
-file = os.path.join("data", inputfile)
+name = os.path.basename(inputfile)
+name = re.sub("\.rds$", "", name)
 
-name = re.sub("\.rds$", "", inputfile)
-
-exprs_rds = readRDS(file)
+exprs_rds = readRDS(inputfile)
 exprs_df = pandas2ri.ri2py(exprs_rds)
 exprs_df.index = rownamesRDS(exprs_rds)
 
@@ -84,7 +83,7 @@
 
 # Load RAS Classifier
 file = os.path.join(
-    "analyses", "tp53_nf1_score", "reference", "ras_classifier_coefficients.tsv"
+    "reference", "ras_classifier_coefficients.tsv"
 )
 ras_coef_df = pd.read_table(file, index_col=0)
 ras_coef_df = ras_coef_df.query("abs_val > 0")
@@ -119,7 +118,7 @@
 
 # Load TP53 Classifier
 file = os.path.join(
-    "analyses", "tp53_nf1_score", "reference", "tp53_classifier_coefficients.tsv"
+    "reference", "tp53_classifier_coefficients.tsv"
 )
 tp53_coef_df = pd.read_table(file, index_col=0)
 tp53_coef_df = tp53_coef_df.query("abs_val > 0")
@@ -154,7 +153,7 @@
 
 # Load NF1 Classifier
 file = os.path.join(
-    "analyses", "tp53_nf1_score", "reference", "nf1_classifier_coefficients.tsv"
+    "reference", "nf1_classifier_coefficients.tsv"
 )
 nf1_coef_df = pd.read_table(file, index_col=0)
 nf1_coef_df = nf1_coef_df.query("abs_val > 0")
@@ -210,5 +209,5 @@
 filename = pd.Series([name, "classifier_scores.tsv"])
 filename = filename.str.cat(sep="_")
 print(filename)
-file = os.path.join("analyses", "tp53_nf1_score", "results", filename)
+file = os.path.join("results", filename)
 all_results.to_csv(file, sep="\t", index=False)
diff --git a/analyses/tp53_nf1_score/02-qc-rna_expression_score.Rmd b/analyses/tp53_nf1_score/02-qc-rna_expression_score.Rmd
@@ -2,7 +2,11 @@
 title: "QC correlation of classifier score with RNA-Seq expression"
 author: "K S Gaonkar (D3B)"
 output: html_notebook
-
+params:
+  base_run:
+    label: "1/0 to run with base histology"
+    value: 0
+    input: integer
 ---
 
 In this notebook we will find correlation between RNA-Seq and classifier score
@@ -20,7 +24,11 @@ root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
 data_dir <- file.path(root_dir, "data")
 results_dir <- "results"
 
-clinical <- read_tsv(file.path(data_dir,"pbta-histologies.tsv"))
+if ( params$base_run ==0 ){
+  clinical<-read.delim(file.path(data_dir,"pbta-histologies.tsv"), stringsAsFactors = FALSE)
+} else{
+  clinical<-read.delim(file.path(data_dir,"pbta-histologies-base.tsv"), stringsAsFactors = FALSE)  
+}
 ```
 
 ### Classifier scores

diff --git a/analyses/tp53_nf1_score/02-qc-rna_expression_score.nb.html b/analyses/tp53_nf1_score/02-qc-rna_expression_score.nb.html
diff --git a/analyses/tp53_nf1_score/03-tp53-cnv-loss-domain.Rmd b/analyses/tp53_nf1_score/03-tp53-cnv-loss-domain.Rmd
@@ -2,7 +2,11 @@
 title: "Find CNV losses that overlap with TP53 domains"
 author: "K S Gaonkar (D3B)"
 output: html_notebook
-
+params:
+  base_run:
+    label: "1/0 to run with base histology"
+    value: 0
+    input: integer
 ---
 
 In this script we will find CNV losses that overlap with TP53 domains:
@@ -58,8 +62,13 @@ bioMartDataPfamTp53 <-
   dplyr::filter(hgnc_symbol=="TP53")
 
 # histology file
-histologies_df <- read_tsv(file.path(data_dir,
-                                     "pbta-histologies.tsv")) %>%
+if ( params$base_run ==0 ){
+  clinical<-read.delim(file.path(data_dir,"pbta-histologies.tsv"), stringsAsFactors = FALSE)
+} else{
+  clinical<-read.delim(file.path(data_dir,"pbta-histologies-base.tsv"), stringsAsFactors = FALSE)  
+}
+
+histologies_df <- clinical %>%
   dplyr::select("Kids_First_Biospecimen_ID",
                 "sample_id")
 

diff --git a/analyses/tp53_nf1_score/03-tp53-cnv-loss-domain.nb.html b/analyses/tp53_nf1_score/03-tp53-cnv-loss-domain.nb.html
diff --git a/analyses/tp53_nf1_score/04-tp53-sv-loss.Rmd b/analyses/tp53_nf1_score/04-tp53-sv-loss.Rmd
@@ -2,7 +2,11 @@
 title: "Find CNV losses that overlap with TP53 domains"
 author: "K S Gaonkar (D3B)"
 output: html_notebook
-
+params:
+  base_run:
+    label: "1/0 to run with base histology"
+    value: 0
+    input: integer
 ---
 
 In this script we will find if there are a Structural Variant breakpoints within TP53 or covering the gene locus such that it is deleted and concurrently, there are either one or both of:
@@ -38,7 +42,13 @@ if (!dir.exists(results_dir)) {
 manta_sv <- read_tsv(file.path(data_dir, "pbta-sv-manta.tsv.gz"))
 stranded_expr <- readRDS(file.path(data_dir, "pbta-gene-expression-rsem-fpkm-collapsed.stranded.rds"))
 polya_expr <- readRDS(file.path(data_dir, "pbta-gene-expression-rsem-fpkm-collapsed.polya.rds"))
-histology <- read_tsv(file.path(data_dir, "pbta-histologies.tsv"))
+if ( params$base_run ==0 ){
+  clinical<-read.delim(file.path(data_dir,"pbta-histologies.tsv"), stringsAsFactors = FALSE)
+} else{
+  clinical<-read.delim(file.path(data_dir,"pbta-histologies-base.tsv"), stringsAsFactors = FALSE)  
+}
+
+histology <- select(clinical,c("Kids_First_Biospecimen_ID", "sample_id"))
 
 # high confidence cnv losses from `03-tp53-cnv-loss-domain.Rmd`
 cnv_tp53_loss <- read_tsv(file.path(results_dir, "loss_overlap_domains_tp53.tsv"))
@@ -58,7 +68,7 @@ manta_sv_tp53 <- manta_sv %>%
       grepl("/TP53/", Gene.name),
     SV.type %in% c("BND", "INV", "DEL")
   ) %>%
-  left_join(select(histology, c("Kids_First_Biospecimen_ID", "sample_id")),
+  left_join(histology,
     by = c("Kids.First.Biospecimen.ID.Tumor" = "Kids_First_Biospecimen_ID")
   )
 
@@ -99,7 +109,7 @@ polya_expr_tp53 <- polya_expr["TP53", ] %>%
 
 lowexpr_tp53_with_sv <- bind_rows(stranded_expr_tp53, polya_expr_tp53) %>%
   # add sample_id to expression df
-  left_join(select(histology, c("Kids_First_Biospecimen_ID", "sample_id")),
+  left_join(histology,
     by = c("Kids_First_Biospecimen_ID")
   ) %>%
   # filter to keep only low expressed TP53 sample_ids