-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFigureS1.R
140 lines (120 loc) · 5.7 KB
/
FigureS1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#------------------------------------------------------------------------------#
# #
# Figure S1 #
# #
#------------------------------------------------------------------------------#
library(here)
library(wesanderson)
library(tidyverse)
library(GenomicRanges)
# Load paths to data files
source(here('helper_data_file_locations.R'))
# Load ggplot2
source(here('helper_ggplot2_settings.R'))
#------------------------------------------------------------------------------#
# Panel A #
# Histogram of distances between consecutive SNPs #
#------------------------------------------------------------------------------#
SNPs <- read_tsv(SNPs)
SNPs <- read_tsv('~/Downloads/S288c_v_SK1.snp')
# Keep only SK1 chromosomes and order table by chromosome and position
SNPs <- SNPs[str_detect(SNPs$chr, '_SK1'), ]
message(nrow(SNPs), ' annotated SNPs')
SNPs <- SNPs[order(SNPs$chr, SNPs$position), ]
SNPs$chr <- factor(SNPs$chr, levels=paste0('chr', as.roman(1:16), '_SK1'))
SNPs <- SNPs[order(SNPs$chr), ]
# Compute distances between consecutive SNPs
SNPs <- SNPs %>% group_by(chr) %>% mutate(Distance = c(NA, diff(position)))
mean_value <- mean(SNPs$Distance, na.rm=TRUE)
median_value <- median(SNPs$Distance, na.rm=TRUE)
p <- ggplot(SNPs, aes(Distance)) +
geom_histogram(aes(y= ..count..), bins=50, colour='grey30',
alpha=0.25, na.rm = T) +
xlim(-5, 500) +
#geom_vline(xintercept = mean_value, color = 'red') +
geom_segment(aes(x=mean_value, y=0, xend=mean_value, yend=3000),
linetype='dotted', color='red', size=0.4) +
#geom_vline(xintercept = median_value, color = 'blue') +
geom_segment(aes(x=median_value, y=0, xend=median_value, yend=5000),
linetype='dotted', color='red', size=0.4) +
labs(title='', x='Distance (bp)') +
annotate('text', x=mean_value, y=4000,
# fontface='bold',
label=paste0("mean\n(", round(mean_value, 0), ")")) +
annotate('text', x=median_value, y=6000,
# fontface='bold',
label=paste0("median\n(", round(median_value, 0), ")")) +
annotate('text', x=350, y=6000,
# fontface='bold',
label=paste0('(n = ', nrow(SNPs), ')'))
# Generated pdf is extemely large after importing to vector graphics software
# (large number of data points)
# Alternative: get the histogram summary table and then build plot
d <- ggplot_build(p)$data[[1]]
hist <- data.frame(x=d$x,
y=d$y)
ggplot(hist, aes(x, y)) +
geom_bar(stat='identity', colour='grey30', alpha=0.25, na.rm = T) +
geom_segment(aes(x=mean_value, y=0, xend=mean_value, yend=2800),
linetype='dotted', color='red', size=0.4) +
geom_segment(aes(x=median_value, y=0, xend=median_value, yend=4800),
linetype='dotted', color='red', size=0.4) +
labs(title='', x='Distance (bp)', y='Count') +
annotate('text', x=mean_value, y=4500,
label=paste0("mean\n(", round(mean_value, 0), ")")) +
annotate('text', x=median_value, y=6500,
label=paste0("median\n(", round(median_value, 0), ")")) +
annotate('text', x=350, y=7000,
label=paste0('(n SNPs = ', nrow(SNPs), ')'))
#------------------------------------------------------------------------------#
# Panel B #
# Histogram of distances between consecutive SNPs for each chromosome #
#------------------------------------------------------------------------------#
mean_values <- SNPs %>% group_by(chr) %>%
summarise(Mean=mean(Distance, na.rm = TRUE))
median_values <- SNPs %>% group_by(chr) %>%
summarise(Median=median(Distance, na.rm = TRUE))
### The SNP list generated using Mauve has some problems, particularly large
### numbers of SNPs detected in short, non-homologous regions; these
### artificially inflate the SNP distance stats on chrI
# Remove selected non-homologous regions on chrI before plotting
regions <- list(
c(200000, 208500),
c(32000, 34000)
)
length <- 0
for (region in regions) {
length <- length + (region[2] - region[1])
}
message('Total length removed from chrI: ', length / 1000, ' kb')
SNPs_homol <- SNPs
for (region in regions) {
SNPs_homol <- subset(
SNPs_homol,
!(chr == 'chrI_SK1' & position > region[1] & position < region[2]))
}
nrow(SNPs)
nrow(SNPs_homol)
mean_values <- SNPs_homol %>% group_by(chr) %>%
summarise(Mean=mean(Distance, na.rm = TRUE))
median_values <- SNPs_homol %>% group_by(chr) %>%
summarise(Median=median(Distance, na.rm = TRUE))
drop_genome <- function(string) str_replace(string, '_SK1', '')
ggplot(SNPs_homol, aes(Distance)) +
geom_histogram(aes(y= ..count..), bins=50, colour='grey30',
alpha=0.25, na.rm = T) +
facet_wrap( ~ chr, ncol=4, labeller=as_labeller(drop_genome)) +
xlim(-5, 500) +
geom_segment(aes(x=Mean, y=0, xend=Mean, yend=700), data=mean_values,
linetype='dotted', color='red', size=0.4) +
geom_text(aes(x=Mean+80, y=800, label=paste0("mean (", round(Mean, 0), ")")),
data=mean_values, size=3) +
geom_segment(aes(x=Median, y=0, xend=Median, yend=1000), data=median_values,
linetype='dotted', color='red', size=0.4) +
geom_text(aes(x=Median+80, y=1100,
label=paste0("median (", round(Median, 0), ")")),
data=median_values, size=3) +
labs(title='', x='Distance (bp)', y='Count') +
theme(
strip.background=element_blank()
)