AlexsLemonade · jaclyn-taroni · Nov 5, 2020 · Oct 21, 2020 · Oct 21, 2020 · Oct 21, 2020
diff --git a/analyses/fusion_filtering/04-project-specific-filtering.Rmd b/analyses/fusion_filtering/04-project-specific-filtering.Rmd
@@ -107,6 +107,10 @@ print("Raw calls from STARfusion and Arriba for PBTA")
 table(fusion_calls$Caller)
 ```
 
+### Aggregate 
+
+If FusionName is called by multiple callers we will have a column 'CalledBy'
+to specify that multiple callers "Caller1,Caller2" have called it
 
 ```{r}
 
@@ -121,8 +125,65 @@ fusion_caller.summary <- fusion_calls %>%
 #to add aggregated caller from fusion_caller.summary
 fusion_calls<-fusion_calls %>% 
   left_join(fusion_caller.summary,by=(c("Sample","FusionName","Fusion_Type"))) %>%
-  dplyr::select(-JunctionReadCount,-SpanningFragCount,-Confidence,-LeftBreakpoint,-RightBreakpoint) %>% unique()
+  unique()
+```
+
+### Idenitify kinase domain retention status
+
+Kinase domainIDs are obtained pfam by a simple grep "kinase" in their Name
+
+```{r}
+# identify kinase domain from bioMartPfam dataframe provided with annoFuse
+bioMartDataPfam <- readRDS(system.file("extdata", "pfamDataBioMart.RDS", package = "annoFuse"))
+
+# look for domains with "kinase" in their domain Name
+kinaseid<-unique(bioMartDataPfam[grep("kinase",bioMartDataPfam$NAME),
+                                 c("pfam_id","NAME")] ) 
 
+kinaseid
+```
+
+Through annoFuse::fusion_driver domain retention status for given kinase pfamID is being added per Gene1A (5' Gene) and Gene1B (3' Gene)
+```{r}
+fusion_calls <- annoFuse::fusion_driver(fusion_calls,
+                                  annotated = TRUE,
+                                  checkDomainStatus=TRUE,
+                                  # check status for given pfamID
+                                  domainsToCheck=kinaseid$pfam_id,
+                                  # we don't want to filter for putative driver fusion yet
+                                  filterPutativeDriver = FALSE
+                                  )
+```
+
+### Adding check to see if reciprocal fusion exists
+
+```{r}
+
+# check for fusions have reciprocal fusions in the same Sample
+# works only for GeneY -- GeneX ; GeneX -- GeneY matches
+reciprocal_fusion <- function(FusionName,Sample,standardFusioncalls ){
+  Gene1A <- strsplit(FusionName,"--")[[1]][1]
+  Gene1B <- strsplit(FusionName,"--")[[1]][2]
+  reciprocal <- paste0(Gene1B,"--",Gene1A)
+  check <- any(standardFusioncalls$FusionName[standardFusioncalls$Sample==Sample] == reciprocal)
+  df <- data.frame("FusionName"=FusionName,"Sample"=Sample,"reciprocal_exists"=check)
+}
+
+# run reciprocal_fusion function to get status of fusion if reciprocal
+is_reciprocal <- apply(fusion_calls,1,function(x) reciprocal_fusion(x["FusionName"],x["Sample"],fusion_calls))
+
+# convert list to dataframe
+is_reciprocal<-data.frame(Reduce(rbind, is_reciprocal))
+
+# merge to reciprocal status to fusion calls
+fusion_calls <- fusion_calls %>%
+  dplyr::left_join(is_reciprocal,by=c("Sample","FusionName"))
+
+```
+
+### Filter Putative Oncogene fusions
+
+``` {r}
 #merge with histology file
 fusion_calls<-merge(fusion_calls,clinical,by.x="Sample",by.y="Kids_First_Biospecimen_ID")
 
@@ -136,6 +197,7 @@ putative_driver_annotated_fusions <- fusion_calls %>%
 
 ```
 
+### Other non-Oncogenic fusions filtering
 
 ```{r}
 # remove local rearrangements
@@ -174,7 +236,7 @@ sample.count <- fusion_calls %>%
 
 ```
 
-
+#### Keep recurrent non-oncogenic fusions specific to a broad_histology
 
 ```{r}
 #filter QCGeneFiltered_filtFusion to keep recurrent fusions from above sample.count and fusion_calls.summary
@@ -185,8 +247,7 @@ QCGeneFiltered_recFusion<-fusion_calls %>%
 ```
 
 
-
-
+### Remove non-oncogenic fusions found in multiple histologies
 
 ```{r}
 # remove fusions that are in > numGroup
@@ -253,6 +314,3 @@ putative_driver_fusions<-rbind(QCGeneFiltered_recFusionUniq,putative_driver_anno
 write.table(putative_driver_fusions,file.path(root_dir,"scratch","pbta-fusion-putative-oncogenic-preQC.tsv"),sep="\t",quote=FALSE,row.names = FALSE)
 
 ```
-
-
-
diff --git a/analyses/fusion_filtering/04-project-specific-filtering.nb.html b/analyses/fusion_filtering/04-project-specific-filtering.nb.html
diff --git a/analyses/fusion_filtering/05-QC_putative_onco_fusion_dustribution.nb.html b/analyses/fusion_filtering/05-QC_putative_onco_fusion_dustribution.nb.html
diff --git a/analyses/fusion_filtering/README.md b/analyses/fusion_filtering/README.md
@@ -42,6 +42,7 @@ The code to generate genelistreference.txt and fusionreference.txt is available
 `03-Calc-zscore-annotate.R` : Calculates z-score for gene fused gene's expression compared to GTeX brain samples and annotates differential expression status
 
 `04-project-specific-filtering.Rmd` : Performs project specific filtering. We prioritize the fusions as putative-oncogenic fusions if any fused gene in the fusion is annotated as kinases, oncogenes, tumor suppressors, curated transcription factors or present in COSMIC Cancer Gene Census list. We also annotated fusions if they are present in TCGA fusions list.
+All fusion calls are additionally have columns `reciprocal_exists` to specify if within the Sample a fusion GeneX--GeneY has a reciprocal GeneY--GeneX . `DomainRetainedGene1A` and `DomainRetainedGene1B` are added to identify kinase domain retention for Gene1A (5` Gene) and Gene1B (3` Gene).
 Oncogene annotated fusions do not need to be in both callers to be retained however if these fusions are found in more than 4 histologies we treat them as false calls and remove them.
 To scavenge back non-oncogenic fusions that are recurrently found uniquely in a broad_histology we kept fusions that were called by both callers and if >2 samples per histology called the fusion.
 We removed the non-oncogenic fusions with genes fused more than 5 times in a samples or found in more than 1 histology as potential artifact.