-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathfusion_driver.R
152 lines (137 loc) · 7.61 KB
/
fusion_driver.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#' Filter standardized fusion calls for driver fusions
#'
#' If standardized fusion calls are annotated using the geneListReferenceDataTab and fusionReferenceDataTab filters out fusion calls where partner genes are not annotated.
#' If standardized fusion is not annotated it will be annotated with geneListReferenceDataTab and fusionReferenceDataTab provided.
#' Domain retention status for Gene1A and Gene1B for the given pfamIDs is also annotated; defaults to kinase domain retention status information
#'
#' @param standardFusioncalls A dataframe from star fusion or arriba (more callers to be added)
#' @param filterPutativeDriver filter out fusion calls where partner genes are not annotated from with gene and fusion reference list by annnoFuse::annotate_fusion_calls()
#' @param annotated Logical value to specify if input if annotated by annnoFuse::annotate_fusion_calls()
#' @param geneListReferenceDataTab A dataframe with column 1 as GeneName 2 source file 3 type; collapse to summarize type
#' @param fusionReferenceDataTab A dataframe with column 1 as FusionName 2 source file 3 type; collapse to summarize type
#' @param checkDomainStatus Logical value to check if domain status in fused gene for given domansToCheck, default to FALSE
#' @param domainsToCheck pfamID to check for retention status, the IDs can be found here http://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/pfamDesc.txt.gz, defaults to using kinase pfam IDs since the fusions with kinase domain are more relevant for therapy
#'
#'
#' @export
#'
#' @return Putative Driver standardized fusion calls annotated with gene list and
#' fusion list provided in reference folder. If checkDomainStatus == TRUE and
#' domain retention status for given pfamID is also provided along with the gene
#' location corresponding to the domain retention status
#'
#' @examples
#' out_annofuse <-
#' system.file("extdata", "PutativeDriverAnnoFuse.tsv", package = "annoFuseData")
#' sfc <- read.delim(out_annofuse)
#' geneListReferenceDataTab <- read.delim(
#' system.file("extdata", "genelistreference.txt", package = "annoFuseData"),
#' stringsAsFactors = FALSE
#' )
#' fusionReferenceDataTab <- read.delim(
#' system.file("extdata", "fusionreference.txt", package = "annoFuseData"),
#' stringsAsFactors = FALSE
#' )
#'
#' bioMartDataPfam <-
#' readRDS(system.file("extdata", "pfamDataBioMart.RDS", package = "annoFuseData"))
#' kinaseid <- unique(bioMartDataPfam$pfam_id[grep("kinase", bioMartDataPfam$NAME)])
#' fusion_driver_df <- fusion_driver(sfc,
#' annotated = TRUE,
#' geneListReferenceDataTab = geneListReferenceDataTab,
#' fusionReferenceDataTab = fusionReferenceDataTab,
#' checkDomainStatus = FALSE,
#' domainsToCheck = kinaseid
#' )
fusion_driver <- function(standardFusioncalls,
filterPutativeDriver = TRUE,
annotated = TRUE,
geneListReferenceDataTab,
fusionReferenceDataTab,
checkDomainStatus = FALSE,
domainsToCheck) {
standardFusioncalls <- .check_annoFuse_calls(standardFusioncalls)
stopifnot(is.logical(annotated))
stopifnot(is.logical(checkDomainStatus))
if (checkDomainStatus) {
# load bioMart Pfam dataframe
bioMartDataPfam <- readRDS(system.file("extdata", "pfamDataBioMart.RDS", package = "annoFuseData"))
if (missing(domainsToCheck)) {
message("domainsToCheck was not provided; Using default kinase pfam ids to checkDomainStatus")
domainsToCheck <- unique(bioMartDataPfam$pfam_id[grep("kinase", bioMartDataPfam$NAME)])
kinaseDomainRetained <- TRUE
}
if (!is.character(domainsToCheck) | !(any(domainsToCheck %in% bioMartDataPfam$pfam_id))) {
message("domainsToCheck was not character types or not found in pfam; Using default kinase pfam ids to checkDomainStatus")
domainsToCheck <- unique(bioMartDataPfam$pfam_id[grep("kinase", bioMartDataPfam$NAME)])
kinaseDomainRetained <- TRUE
}
if (is.character(domainsToCheck)) {
found <- paste(intersect(domainsToCheck, bioMartDataPfam$pfam_id), collapse = ",")
warning(paste(found, " pfamIDs were found in our pfam list"))
}
bioMartDataPfam <- bioMartDataPfam %>%
# keep only pfamIDs to check
dplyr::filter(pfam_id %in% domainsToCheck)
# annotate by domain ;gather partial retention as well
annDomain <- get_Pfam_domain(standardFusioncalls = standardFusioncalls, bioMartDataPfam = bioMartDataPfam, keepPartialAnno = TRUE)
# domain annotation
standardFusionGene1ADomain <- annDomain$Gene1A %>%
dplyr::rename("DomainRetainedGene1A" = "Gene1A_DOMAIN_RETAINED_IN_FUSION") %>%
# mutated Left and Right Breakpoints since the get_Pfam_domain separtaes the
# chromosome and genomic location into LeftBreakpointChr,LeftBreakpoint
# and RightBreakpointChr,RightBreakpoint
# but since format for standardFusioncalls is Chr:Breakpont we are updating here
# for merging in the next step
dplyr::mutate(
LeftBreakpoint = paste(LeftBreakpointChr, LeftBreakpoint, sep = ":"),
RightBreakpoint = paste(RightBreakpointChr, RightBreakpoint, sep = ":")
) %>%
# select only columns required
dplyr::select("LeftBreakpoint", "RightBreakpoint", "FusionName", "Fusion_Type", "Gene1A", "Sample", "DomainRetainedGene1A") %>%
unique()
standardFusionGene1BDomain <- annDomain$Gene1B %>%
dplyr::rename("DomainRetainedGene1B" = "Gene1B_DOMAIN_RETAINED_IN_FUSION") %>%
# mutated Left and Right Breakpoints since the get_Pfam_domain separtaes the
# chromosome and genomic location into LeftBreakpointChr,LeftBreakpoint
# and RightBreakpointChr,RightBreakpoint
# but since format for standardFusioncalls is Chr:Breakpont we are updating here
# for merging in the next step
dplyr::mutate(
LeftBreakpoint = paste(LeftBreakpointChr, LeftBreakpoint, sep = ":"),
RightBreakpoint = paste(RightBreakpointChr, RightBreakpoint, sep = ":")
) %>%
# select only columns required
dplyr::select("LeftBreakpoint", "RightBreakpoint", "FusionName", "Fusion_Type", "Sample", "Gene1B", "DomainRetainedGene1B") %>%
unique()
standardFusionDomain <- full_join(standardFusionGene1ADomain, standardFusionGene1BDomain)
if (exists("kinaseDomainRetained")) {
# default to always print kinase domain retention
standardFusionDomain <- standardFusionDomain %>%
dplyr::rename(
"kinaseDomainRetainedGene1A" = "DomainRetainedGene1A",
"kinaseDomainRetainedGene1B" = "DomainRetainedGene1B"
)
}
standardFusioncalls <- standardFusioncalls %>%
# add status for given pfamIDs
left_join(standardFusionDomain) %>%
unique()
}
if (!annotated) {
# check reference input
stopifnot(is(geneListReferenceDataTab, "data.frame"))
stopifnot(is(fusionReferenceDataTab, "data.frame"))
# annotate
standardFusioncalls <- annotate_fusion_calls(standardFusioncalls = standardFusioncalls, geneListReferenceDataTab = geneListReferenceDataTab, fusionReferenceDataTab = fusionReferenceDataTab)
}
if (filterPutativeDriver) {
standardFusioncalls <- standardFusioncalls %>%
# dplyr::filter(!Gene1A %in% fusion_recurrent5_per_sample$GeneSymbol |
# !Gene2A %in% fusion_recurrent5_per_sample$GeneSymbol |
# !Gene1B %in% fusion_recurrent5_per_sample$GeneSymbol |
# !Gene2B %in% fusion_recurrent5_per_sample$GeneSymbol) %>%
dplyr::filter(!is.na(.data$Gene1A_anno) | !is.na(.data$Gene1B_anno) | !is.na(.data$Gene2A_anno) | !is.na(.data$Gene2B_anno))
}
return(standardFusioncalls)
}