-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcall_significant.R
73 lines (60 loc) · 3.47 KB
/
call_significant.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env Rscript
## Rscript to run statistical stat following DipSeek run on the roadmap data
## To get help :
# Rscript --vanilla call_significant.R --help
##########
# Parser #
##########
library("optparse")
library("data.table")
library("tidyverse")
library("hexbin")
library("gridExtra")
option_list = list(
make_option(c("-v", "--valleys"), type="character", default=NULL,
help="input valley file", metavar="character"),
make_option(c("-p", "--peaks"), type="character", default=NULL,
help="input peak file", metavar="character"),
make_option(c("-o", "--out"), type="character", default="valleys_tested.out",
help="output file name [default= %default]", metavar="character"),
make_option(c("-d", "--diagnosis"), type="character", default="diagnosis.pdf",
help="output diagnosis plot name [default= %default]", metavar="character")
);
opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);
if (is.null(opt$valleys)){
print_help(opt_parser)
stop("Input file not provided.", call.=FALSE)
}
#############
# Functions #
#############
#######
# Run #
#######
# Load valley and peak data
valley_init <- fread(opt$valleys, header=FALSE, col.names = c('index','TF','cov','covaround','lead_covaround','lag_covaround','lead_cov','lag_cov','local_max_dist','peak'))
peak_init <- fread(opt$peaks, header=FALSE, col.names = c('peak','chromosome','span','stderr','roughness'))
df_init <- valley_init %>% left_join(peak_init, by = 'peak') %>% filter(local_max_dist > 200)
pdf(file = opt$diagnosis, onefile = TRUE, width = 15 , height = 7)
# Run diagnosis for span fitting
gspan1 <- ggplot(peak_init, aes(x = roughness, y = stderr)) + geom_hex(binwidth = c(0.1, 0.1)) + theme_light() + coord_fixed(ratio = 1)
gspan2 <- ggplot(peak_init, aes(x = roughness, y = span)) + geom_hex(binwidth = c(0.1, 0.05)) + theme_light()
gspan3 <- ggplot(peak_init, aes(x = stderr, y = span)) + geom_hex(binwidth = c(0.1, 0.05)) + theme_light()
lay <- matrix(c(1,2,1,3), byrow = TRUE, nrow = 2)
grid.arrange(arrangeGrob(gspan1),arrangeGrob(gspan2), arrangeGrob(gspan3), layout_matrix = lay)
# Run poisson test and diagnosis
df_init <- df_init %>% drop_na() %>% mutate(lambda = (lead_covaround + lag_covaround)/2, x = round(covaround), log2fc = log2(lambda/x))
df_init <- df_init %>% mutate(pvalue = ppois(x, lambda = lambda, lower.tail = TRUE), evalue = (pvalue * dim(df_init)[1]))
df_init <- df_init %>% mutate(pvalue = as.numeric(formatC(pvalue, format = "e", digits = 2)), evalue = as.numeric(formatC(evalue, format = "e", digits = 2)))
gval1 <- ggplot(df_init, aes(x = log2fc, y = pvalue)) + geom_hex() + theme_light() + scale_y_reverse()
gval2 <- ggplot(df_init, aes(x = pvalue)) + geom_histogram(binwidth = 0.05, color = 'white') + theme_light()
gval3 <- ggplot(df_init, aes(x = log2fc, y = log10(evalue))) + geom_hex() + theme_light() + scale_y_reverse()
gval4 <- ggplot(df_init, aes(x = log10(evalue))) + geom_histogram(color = 'white') + theme_light()
lay <- matrix(c(1,2), byrow = TRUE, nrow = 1)
grid.arrange(arrangeGrob(gval1),arrangeGrob(gval2), layout_matrix = lay)
grid.arrange(arrangeGrob(gval3),arrangeGrob(gval4), layout_matrix = lay)
# Output bed file of the retained valleys
df_out <- df_init %>% mutate(start = index - 100, end = index + 100, score = pvalue, qscore = evalue ) %>% select(chromosome, start, end, peak, score, qscore)
write.table(df_out, file = opt$out, col.names = FALSE, row.names = FALSE, sep = "\t", quote = FALSE)
dev.off()