-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathBottleneck_size_estimation_approx.r
135 lines (105 loc) · 6.72 KB
/
Bottleneck_size_estimation_approx.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
library(tidyverse)
library(argparse)
# Handle command line arguments first
parser <- ArgumentParser()
parser$add_argument("--file", type="character", default = "./example_data/donor_and_recipient_freqs.txt" ,
help="file containing variant frequencies")
parser$add_argument("--plot_bool", type="logical", default= FALSE,
help="determines whether pdf plot approx_plot.pdf is produced or not")
parser$add_argument("--var_calling_threshold", type="double", default= 0.03,
help="variant calling threshold")
parser$add_argument("--Nb_min", type="integer", default= 1,
help="Minimum bottleneck value considered")
parser$add_argument("--Nb_max", type="integer", default= 200,
help="Maximum bottleneck value considered")
parser$add_argument("--Nb_increment", type="integer", default= 1,
help="increment between Nb values considered, i.e., all values considered will be multiples of Nb_increment that fall between Nb_min and Nb_max")
parser$add_argument("--confidence_level", type="double", default= .95,
help="Confidence level (determines bounds of confidence interval)")
args <- parser$parse_args()
# Now create necessary variables and dataframes
plot_bool <- args$plot_bool # determines whether or not a plot of log likelihood vs bottleneck size is produced
var_calling_threshold <- args$var_calling_threshold # variant calling threshold for frequency in recipient
Nb_min <- args$Nb_min # Minimum bottleneck size we consider
if(Nb_min < 1){Nb_min = 1} # preventing erros with Nb_min at 0 or lower
Nb_max <- args$Nb_max # Maximum bottlebeck size we consider
Nb_increment <- args$Nb_increment
confidence_level <- args$confidence_level # determines width of confidence interval
donor_and_recip_freqs_observed <- read.table(args$file) # table of SNP frequencies in donor and recipient
original_row_count <- nrow(donor_and_recip_freqs_observed) # number of rows in raw table
donor_and_recip_freqs_observed <- subset(donor_and_recip_freqs_observed, donor_and_recip_freqs_observed[, 1] >= var_calling_threshold)
donor_and_recip_freqs_observed[,1] <- as.numeric(donor_and_recip_freqs_observed[,1])
donor_and_recip_freqs_observed[,2] <- as.numeric(donor_and_recip_freqs_observed[,2])
donor_and_recip_freqs_observed <- subset(donor_and_recip_freqs_observed, donor_and_recip_freqs_observed[, 1] <= (1-var_calling_threshold))
new_row_count <- nrow(donor_and_recip_freqs_observed) # number of rows in filtered table
if(new_row_count != original_row_count )
{print("WARNING: Rows of the input file with donor frequency less than variant calling threshold have been removed during analysis. ")}
n_variants <- nrow(donor_and_recip_freqs_observed) # number of variants
freqs_tibble <- tibble(donor_freqs = donor_and_recip_freqs_observed[, 1], recip_freqs = donor_and_recip_freqs_observed[, 2] )
# Now implement the beta binomial algorithm
Log_Beta_Binom <- function(nu_donor, nu_recipient, NB_SIZE) # This function gives Log Likelihood for every donor recipient SNP frequency pair
{ LL_val_above <- 0 # used for recipient frequencies above calling threshold
LL_val_below <- 0 # used for recipient frequencies below calling threshold
nu_donor <- if_else(nu_recipient <=1- var_calling_threshold , nu_donor, 1-nu_donor )
nu_recipient <- if_else(nu_recipient <= 1-var_calling_threshold , nu_recipient, 1-nu_recipient )
for(k in 0:NB_SIZE){
LL_val_above <- LL_val_above + dbeta(nu_recipient, k, (NB_SIZE - k) )*dbinom(k, size=NB_SIZE, prob= nu_donor)
LL_val_below <- LL_val_below + pbeta(var_calling_threshold, k, (NB_SIZE - k))*dbinom(k,size=NB_SIZE, prob= nu_donor)
}
LL_val <- if_else(nu_recipient >= var_calling_threshold , LL_val_above, LL_val_below )
# We use LL_val_above above the calling threshold, and LL_val_below below the calling threshold
LL_val <- log(LL_val) # convert likelihood to log likelihood
return(LL_val)
}
LL_func_approx <- function(Nb_size){ # This function sums over all SNP frequencies in the donor and recipient
Total_LL <- 0
LL_array <- Log_Beta_Binom(freqs_tibble$donor_freqs, freqs_tibble$recip_freqs, Nb_size)
Total_LL <- sum(LL_array)
return(Total_LL)
}
# Now we define array of Log Likelihoods for all possible bottleneck sizes
bottleneck_values_vector <- c()
for ( i in Nb_min:Nb_max)
{ if(i%%Nb_increment == 0) {bottleneck_values_vector <- c(bottleneck_values_vector,i)}
}
LL_tibble <- tibble(bottleneck_size = bottleneck_values_vector, Log_Likelihood = 0*bottleneck_values_vector)
for(I in 1:nrow(LL_tibble) )
{LL_tibble$Log_Likelihood[I] <- LL_func_approx( LL_tibble$bottleneck_size[I] ) }
# Now we find the maximum likelihood estimate and the associated confidence interval
Max_LL <- max(LL_tibble$Log_Likelihood) # Maximum value of log likelihood
Max_LL_bottleneck_index <- which(LL_tibble$Log_Likelihood == max(LL_tibble$Log_Likelihood) ) # bottleneck size at which max likelihood occurs
Max_LL_bottleneck <- bottleneck_values_vector[Max_LL_bottleneck_index]
likelihood_ratio <- qchisq(confidence_level, df=1) # necessary ratio of likelihoods set by confidence level
ci_tibble <- filter(LL_tibble, 2*(Max_LL - Log_Likelihood) <= likelihood_ratio )
lower_CI_bottleneck <- min(ci_tibble$bottleneck_size) #-1 # lower bound of confidence interval
upper_CI_bottleneck <- max(ci_tibble$bottleneck_size) #+1# upper bound of confidence interval
#if ci_tibble is empty
if (length(ci_tibble$Log_Likelihood) == 0) {
lower_CI_bottleneck <- min(Max_LL_bottleneck)
upper_CI_bottleneck <- max(Max_LL_bottleneck)
}
if(max(Max_LL_bottleneck) ==max(bottleneck_values_vector))
{upper_CI_bottleneck <- max(bottleneck_values_vector)
print("Peak bottleneck value for MLE is at Nb_max (or largest possible value given Nb_increment)! Try raising Nb_max for better bottleneck estimate")
}
if(min(Max_LL_bottleneck) == min(bottleneck_values_vector))
{lower_CI_bottleneck <- min(bottleneck_values_vector)
if(min(bottleneck_values_vector) > 1){print("Peak bottleneck value for MLE is at Nb_min (or smallest possible value given Nb_increment)! Try lowering Nb_min for better bottleneck estimate")}
}
# now we plot our results
if(plot_bool == TRUE){
LL_tibble <- filter(LL_tibble, Log_Likelihood != -Inf)
ggplot(data = LL_tibble) + geom_point(aes(x = bottleneck_size, y= Log_Likelihood )) +
geom_vline(xintercept= Max_LL_bottleneck ) +
geom_vline(xintercept= lower_CI_bottleneck, color = "green" ) +
geom_vline(xintercept= upper_CI_bottleneck, color = "green" ) +
labs(x= "Bottleneck Size", y = "Log Likelihood" )
ggsave("approx_plot.jpg")
}
print("Bottleneck size")
if(length(Max_LL_bottleneck) > 1){print("MLE is degenerate. The best bottleneck values are")}
print(Max_LL_bottleneck)
print("confidence interval left bound")
print(lower_CI_bottleneck)
print("confidence interval right bound")
print(upper_CI_bottleneck)