forked from AlexsLemonade/OpenPBTA-analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path04-project-specific-filtering.Rmd
255 lines (183 loc) · 7.26 KB
/
04-project-specific-filtering.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
---
title: "Project specific filtering"
output: html_notebook
params:
histology:
label: "Clinical file"
value: data/pbta-histologies.tsv
input: file
group:
label: "Grouping variable"
value: broad_histology
input: string
dataStranded:
label: "Input filtered fusion dataframe"
value: scratch/standardFusionStrandedExp_QC_expression_filtered_annotated.RDS
input: file
dataPolya:
label: "Input filtered fusion dataframe"
value: scratch/standardFusionPolyaExp_QC_expression_filtered_annotated.RDS
input: file
numCaller:
label: "Least Number of callers to have called fusion"
value: 2
input: integer
numSample:
label: "Least Number of samples to have fusion per group"
value: 2
input: integer
numGroup:
label: "Max number of groups found in"
value: 1
input: integer
limitMultiFused:
label: "Max number of times gene can be fused per sample"
value: 5
input: integer
outputfolder:
label: "results folder for *tsv files"
value: results
input: string
---
K S Gaonkar
Filtered Fusions:
1. In-frame fusions is called in atleast 2 samples per histology OR
2. In-frame fusions is called in atleast 2 callers
AND
Filtered-fusions found in more than 1 histology OR
Filtered-fusion doesn't have multi-fused gene (more than 5 times in sample)
Putative Driver:
Filtering for general cancer specific genes
Fusions with genes in either onco
This notebook assumes you are in OpenPBTA-analysis project folder structure.
```{r}
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
#load required packages
suppressPackageStartupMessages(library("readr"))
suppressPackageStartupMessages(library("tidyverse"))
suppressPackageStartupMessages(library("reshape2"))
suppressPackageStartupMessages(library("qdapRegex"))
#read filtFusion files
strandedQCGeneFiltered_filtFusion<-readRDS(file.path(root_dir, params$dataStranded))
polyaQCGeneFiltered_filtFusion<-readRDS(file.path(root_dir, params$dataPolya))
# results folder
outputfolder<-params$outputfolder
QCGeneFiltered_filtFusion<-rbind(strandedQCGeneFiltered_filtFusion,polyaQCGeneFiltered_filtFusion)
write.table(QCGeneFiltered_filtFusion, file.path(outputfolder, "FilteredFusion.tsv"),sep="\t",quote=FALSE,row.names = FALSE)
# subset for recurrent fusion detection and multifused genes QC
fusion_calls<-unique(QCGeneFiltered_filtFusion)
# remove distance from intergenic fusions
fusion_calls$FusionName<-unlist(lapply(fusion_calls$FusionName,function(x) rm_between(x, "(", ")", extract = F)))
# get grouping column id
group<-params$group
# get histology file
clinical<-read.delim(file.path(root_dir, params$histology), stringsAsFactors = FALSE)
clinical<-clinical[,c("Kids_First_Biospecimen_ID","Kids_First_Participant_ID","broad_histology")]
# Least number of callers
numCaller<-params$numCaller
# Least number of samples per group
numSample<-params$numSample
# Max number of groups
numGroup<-params$numGroup
# Max number of times gene can be fused per sample
limitMultiFused<-params$limitMultiFused
print("Raw calls from STARfusion and Arriba for PBTA")
table(fusion_calls$Caller)
```
```{r}
# aggregate caller
fusion_caller.summary <- fusion_calls %>%
dplyr::select(Sample,FusionName,Caller,Fusion_Type) %>%
group_by(FusionName, Sample ,Fusion_Type) %>%
unique() %>%
dplyr::mutate(CalledBy = toString(Caller), caller.count = n()) %>%
dplyr::select(-Caller)
# remove fusion within local rearrangement
fusion_calls <- fusion_calls %>%
# remove local rearrangement/adjacent genes
dplyr::filter(!grepl("LOCAL_REARRANGEMENT|LOCAL_INVERSION",annots))
#to add aggregated caller from fusion_caller.summary
fusion_calls<-fusion_calls %>%
dplyr::select(-Caller,-annots) %>%
left_join(fusion_caller.summary,by=(c("Sample","FusionName","Fusion_Type"))) %>%
dplyr::select(-JunctionReadCount,-SpanningFragCount,-Confidence,-LeftBreakpoint,-RightBreakpoint) %>% unique()
#merge with histology file
fusion_calls<-merge(fusion_calls,clinical,by.x="Sample",by.y="Kids_First_Biospecimen_ID")
# filter for putative driver genes and mutifused genes per sample
putative_driver_annotated_fusions <- fusion_calls %>%
dplyr::filter(!is.na(Gene1A_anno) | !is.na(Gene1B_anno) | !is.na(Gene2A_anno) | !is.na(Gene2B_anno)) %>%
unique()
```
```{r}
# Gene fusion should be in-frame/frameshift
fusion_calls<-fusion_calls %>%
dplyr::filter(Fusion_Type != "other")
# AND
#
# 1. Called by at least n callers
fusion_calls.summary <- fusion_calls %>%
dplyr::filter(caller.count >= numCaller) %>%
unique() %>%
mutate(note=paste0("Called by",numCaller, "callers")) %>%
as.data.frame()
# OR
# 2. Found in at least n samples in each group
sample.count <- fusion_calls %>%
dplyr::filter(Fusion_Type != "other") %>%
dplyr::select(FusionName, Sample, !!as.name(group),-Fusion_Type) %>%
unique() %>%
group_by(FusionName, !!as.name(group)) %>%
dplyr::mutate(sample.count = n(),Sample = toString(Sample)) %>%
dplyr::filter(sample.count > numSample) %>%
unique() %>%
mutate(note=paste0("Found in atleast ",numSample, " samples in a group")) %>%
as.data.frame()
```
```{r}
#filter QCGeneFiltered_filtFusion to keep recurrent fusions from above sample.count and fusion_calls.summary
QCGeneFiltered_recFusion<-fusion_calls %>%
dplyr::filter(FusionName %in% unique(c(sample.count$FusionName,fusion_calls.summary$FusionName)))
```
```{r}
# remove fusions that are in > numGroup
group.count <- fusion_calls %>%
dplyr::select(FusionName, !!as.name(group)) %>%
unique() %>%
group_by(FusionName) %>%
dplyr::mutate(group.ct = n(),Sample = toString(!!(as.name(group)))) %>%
dplyr::filter(group.ct >numGroup)
# remove multi-fused genes
fusion_recurrent5_per_sample <- fusion_calls %>%
# We want to keep track of the gene symbols for each sample-fusion pair
dplyr::select(Sample, FusionName, Gene1A, Gene1B, Gene2A, Gene2B) %>%
# We want a single column that contains the gene symbols
tidyr::gather(Gene1A, Gene1B, Gene2A, Gene2B,
key = gene_position, value = GeneSymbol) %>%
# Remove columns without gene symbols
dplyr::filter(GeneSymbol != "") %>%
dplyr::arrange(Sample, FusionName) %>%
# Retain only distinct rows
dplyr::distinct() %>%
group_by(Sample,GeneSymbol) %>%
dplyr::summarise(Gene.ct = n()) %>%
dplyr::filter(Gene.ct>limitMultiFused) %>%
mutate(note=paste0("multfused " ,limitMultiFused, " times per sample"))
```
```{r}
# filter QCGeneFiltered_recFusion to remove fusions found in more than 1 group
recurrent_symbols <- fusion_recurrent5_per_sample$GeneSymbol
QCGeneFiltered_recFusionUniq<-QCGeneFiltered_recFusion %>%
dplyr::filter(!FusionName %in% group.count$FusionName) %>%
dplyr::filter(!Gene1A %in% recurrent_symbols |
!Gene2A %in% recurrent_symbols |
!Gene1B %in% recurrent_symbols |
!Gene2B %in% recurrent_symbols) %>%
unique()
```
```{r}
# merge putative annotated oncogenic and scavenged back non-oncogenic annotated, recurrent fusions
putative_driver_fusions<-rbind(QCGeneFiltered_recFusionUniq,putative_driver_annotated_fusions) %>%
unique() %>% dplyr::select (-broad_histology) %>%
as.data.frame()
write.table(putative_driver_fusions,file.path(outputfolder,"pbta-fusion-putative-oncogenic.tsv"),sep="\t",quote=FALSE,row.names = FALSE)
```