diff --git a/.gitignore b/.gitignore
index b6782bf..5d394ef 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,6 +31,7 @@ vignettes/*.pdf
# knitr and R markdown default cache directories
*_cache/
/cache/
+inst/doc/
# Temporary files created by R markdown
*.utf8.md
@@ -48,7 +49,6 @@ po/*~
# RStudio Connect folder
rsconnect/
-inst/doc
/doc/
/Meta/
diff --git a/DESCRIPTION b/DESCRIPTION
index 3b56e8c..e43cc8f 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -39,7 +39,13 @@ Suggests:
rmarkdown,
testthat (>= 3.0.0),
withr,
- mnormt
+ mnormt,
+ here,
+ patchwork,
+ deSolve,
+ cowplot,
+ mixtools,
+ MASS
VignetteBuilder: knitr
Config/testthat/edition: 3
LazyData: true
diff --git a/NAMESPACE b/NAMESPACE
index 73a94af..ebea38f 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,15 +1,16 @@
# Generated by roxygen2: do not edit by hand
+export(hmde_affine_de)
export(hmde_assign_data)
export(hmde_canham_de)
export(hmde_const_de)
export(hmde_extract_estimates)
-export(hmde_linear_de)
export(hmde_model)
export(hmde_model_des)
-export(hmde_model_name)
+export(hmde_model_names)
export(hmde_model_pars)
export(hmde_plot_de_pieces)
+export(hmde_plot_obs_est_inds)
export(hmde_run)
export(hmde_vb_de)
import(Rcpp)
diff --git a/R/hmde_assign_data.R b/R/hmde_assign_data.R
index 5a384f6..c976c9e 100644
--- a/R/hmde_assign_data.R
+++ b/R/hmde_assign_data.R
@@ -2,21 +2,28 @@
#'
#' @param model_template output from hmde_model
#' @param data Input data tibble with columns including time, y_obs, obs_index, and additionally ind_id for multi-individual models
-#' @param step_size Step size for numerical integration.
#' @param ... data-masking name-value pairs allowing specific input of elements
#'
#' @return updated named list with your data assigned to Stan model parameters
#' @export
-hmde_assign_data <- function(model_template, data = NULL, step_size = NULL, ...){
- if(!model_template$model %in% hmde_model_name()){
- stop("Model name not recognised. Run hmde_model_name() to see available models.")
+hmde_assign_data <- function(model_template, data = NULL, ...){
+ if(!model_template$model %in% hmde_model_names()){
+ stop("Model name not recognised. Run hmde_model_names() to see available models.")
}
if(!is.null(data)){ # Use provided tibble
user_fields <- names(data)
+ user_code <- rlang::enquos(..., .check_assign = TRUE)
+ additional_user_fields <- names(user_code)
+ # Evaluate the RHS of expressions (the values)
+ additional_data <- purrr::map(user_code,
+ ~rlang::eval_tidy(.x, env = rlang::caller_env())
+ )
+
} else { # Grab user expressions from individual list items and extract data
+ additional_user_fields <- NULL
user_code <- rlang::enquos(..., .check_assign = TRUE)
user_fields <- names(user_code)
# Evaluate the RHS of expressions (the values)
@@ -45,16 +52,21 @@ hmde_assign_data <- function(model_template, data = NULL, step_size = NULL, ...)
}
for(i in model_fields){ # Iterate through required fields and fill them
- if(i %in% user_fields){
+ if(i %in% user_fields){ #Check if the user has supplied it in a tibble
model_template <- purrr::list_modify(model_template, !!!data[i])
- } else {
+
+ } else if(!is.null(additional_user_fields)){
+ if(i %in% additional_user_fields){ #Check if the user supplied it directly
+ model_template <- purrr::list_modify(model_template, !!!additional_data[i])
+ }
+ }
+
+ if(is.null(model_template[[i]])){ #Catches default tibble transformations
model_template[[i]] <- switch(
i,
n_obs = length(data$y_obs),
n_ind = length(unique(data$ind_id)),
- y_0_obs = data$y_obs[which(data$obs_index == 1)],
- y_bar = mean(data$y_obs),
- model = model_template$model
+ y_bar = mean(data$y_obs)
)
}
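
A minimal usage sketch of the reworked `...` handling, mirroring the new test at the bottom of this diff and assuming the `Trout_Size_Data` dataset shipped with the package: prior parameters can now be overridden directly even when the measurement data come in as a tibble.

```r
library(hmde)

# Tibble supplies y_obs, obs_index, time, ind_id; the named argument
# overrides the template default for the population beta mean prior.
constant_multi <- hmde_model("constant_multi_ind") |>
  hmde_assign_data(data = Trout_Size_Data,
                   prior_pars_pop_beta_mu = c(1, 3))

constant_multi$prior_pars_pop_beta_mu # c(1, 3) instead of the default c(0, 2)
```
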
diff --git a/R/hmde_extract_estimates.R b/R/hmde_extract_estimates.R
index 46f21e4..df82d31 100644
--- a/R/hmde_extract_estimates.R
+++ b/R/hmde_extract_estimates.R
@@ -1,6 +1,5 @@
#' Extract samples and return measurement, individual, and population-level estimates
#'
-#' @param model model name character string
#' @param fit fitted model Stan fit
#' @param input_measurement_data data used to fit the model with ind_id, y_obs, time, obs_index tibble
#'
@@ -9,8 +8,7 @@
#' @import dplyr
#' @importFrom stats quantile
-hmde_extract_estimates <- function(model = NULL,
- fit = NULL,
+hmde_extract_estimates <- function(fit = NULL,
input_measurement_data = NULL){
#Check for fit
if(is.null(fit)){
@@ -21,9 +19,11 @@ hmde_extract_estimates <- function(model = NULL,
stop("Fit not S4 stanfit type.")
}
+ model <- fit@model_name
#Check for model
- if(!model %in% hmde_model_name()){
- stop("Model name not recognised. Run hmde_model_name() to see available models.")
+ if(!model %in% hmde_model_names()){
+ stop(paste0("Model name not recognised: ", model,
+ " Run hmde_model_names() to see available models."))
}
#Check for input measurement data
@@ -33,7 +33,7 @@ hmde_extract_estimates <- function(model = NULL,
}
}
- estimate_list <- list()
+ estimate_list <- list(model_name = model)
par_names <- hmde_model_pars(model)
if(grepl("multi", model)){ #Get n_ind for multi-individual
diff --git a/R/hmde_model_des.R b/R/hmde_model_des.R
index c11b569..9a78623 100644
--- a/R/hmde_model_des.R
+++ b/R/hmde_model_des.R
@@ -5,8 +5,8 @@
#' @export
hmde_model_des <- function(model = NULL){
- if(!model %in% hmde_model_name()){
- stop("Model name not recognised. Run hmde_model_name() to see available models.")
+ if(!model %in% hmde_model_names()){
+ stop("Model name not recognised. Run hmde_model_names() to see available models.")
}
output <- switch(
@@ -17,7 +17,7 @@ hmde_model_des <- function(model = NULL){
canham_multi_ind = hmde_canham_de,
vb_single_ind = hmde_vb_de,
vb_multi_ind = hmde_vb_de,
- linear_single_ind = hmde_linear_de
+ affine_single_ind = hmde_affine_de
)
return(output)
@@ -65,15 +65,15 @@ hmde_vb_de <- function(y = NULL, pars = NULL){
)
}
-#' Differential equation for linear growth single individual model
+#' Differential equation for affine growth single individual model
#' @param y input real
#' @param pars list of parameters beta_0, beta_1
#'
#' @return value of differential equation at y
#' @export
-hmde_linear_de <- function(y = NULL, pars = NULL){
+hmde_affine_de <- function(y = NULL, pars = NULL){
return(
- pars[[1]] + pars[[2]] * y
+ pars[[1]] - pars[[2]] * y
)
}
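
Worth flagging in review: the renamed DE also changes sign, from `pars[[1]] + pars[[2]] * y` to `pars[[1]] - pars[[2]] * y`. A quick sketch of what callers now see:

```r
# dY/dt = beta_0 - beta_1 * Y: growth slows as Y approaches beta_0 / beta_1.
hmde_affine_de(y = 2, pars = list(10, 1))  # 10 - 1 * 2 = 8
hmde_affine_de(y = 10, pars = list(10, 1)) # 0: equilibrium at beta_0 / beta_1
```
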
diff --git a/R/hmde_model_names.R b/R/hmde_model_names.R
index 680c7c6..f532710 100644
--- a/R/hmde_model_names.R
+++ b/R/hmde_model_names.R
@@ -3,14 +3,14 @@
#' @return vector of character strings for model names.
#' @export
-hmde_model_name <- function(){
+hmde_model_names <- function(){
output <- c("constant_single_ind",
"constant_multi_ind",
"canham_single_ind",
"canham_multi_ind",
"vb_single_ind",
"vb_multi_ind",
- "linear_single_ind")
+ "affine_single_ind")
return(output)
}
diff --git a/R/hmde_model_pars.R b/R/hmde_model_pars.R
index c045d80..c6e9922 100644
--- a/R/hmde_model_pars.R
+++ b/R/hmde_model_pars.R
@@ -7,8 +7,8 @@
hmde_model_pars <- function(model=NULL){
- if(!model %in% hmde_model_name()){
- stop("Model name not recognised. Run hmde_model_name() to see available models.")
+ if(!model %in% hmde_model_names()){
+ stop("Model name not recognised. Run hmde_model_names() to see available models.")
}
output <- switch(model,
@@ -18,7 +18,7 @@ hmde_model_pars <- function(model=NULL){
canham_multi_ind = hmde_canham_multi_ind_pars(),
vb_single_ind = hmde_vb_single_ind_pars(),
vb_multi_ind = hmde_vb_multi_ind_pars(),
- linear_single_ind = hmde_linear_single_ind_pars())
+ affine_single_ind = hmde_affine_single_ind_pars())
return(output)
}
@@ -95,13 +95,13 @@ hmde_vb_multi_ind_pars <- function(){
model = "vb_multi_ind")
}
-#' Parameter names for linear growth single individual model
+#' Parameter names for affine growth single individual model
#' @keywords internal
#' @noRd
#'
-hmde_linear_single_ind_pars <- function(){
+hmde_affine_single_ind_pars <- function(){
list(measurement_pars_names = c("y_hat"),
individual_pars_names = c("ind_beta_0", "ind_beta_1"),
- error_pars_names = c("global_error_sigma"),
- model = "linear_single_ind")
+ error_pars_names = c(NULL),
+ model = "affine_single_ind")
}
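
A small sketch of the resulting parameter listing for the new model name; note that `error_pars_names = c(NULL)` evaluates to `NULL`, consistent with the affine Stan model fixing its error sigma rather than estimating it.

```r
pars <- hmde_model_pars("affine_single_ind")
pars$individual_pars_names # "ind_beta_0", "ind_beta_1"
pars$error_pars_names      # NULL: the affine model has no estimated error parameter
```
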
diff --git a/R/hmde_models.R b/R/hmde_models.R
index a5a23b6..afc9b66 100644
--- a/R/hmde_models.R
+++ b/R/hmde_models.R
@@ -7,8 +7,8 @@
hmde_model <- function(model=NULL){
- if(!model %in% hmde_model_name()){
- stop("Model name not recognised. Run hmde_model_name() to see available models.")
+ if(!model %in% hmde_model_names()){
+ stop("Model name not recognised. Run hmde_model_names() to see available models.")
}
output <- switch(model,
@@ -18,7 +18,7 @@ hmde_model <- function(model=NULL){
canham_multi_ind = hmde_canham_multi_ind(),
vb_single_ind = hmde_vb_single_ind(),
vb_multi_ind = hmde_vb_multi_ind(),
- linear_single_ind = hmde_linear_single_ind())
+ affine_single_ind = hmde_affine_single_ind())
class(output) <- "hmde_object"
@@ -34,7 +34,8 @@ hmde_const_single_ind <- function(){
y_obs = NULL,
obs_index = NULL,
time = NULL,
- y_0_obs = NULL,
+ prior_pars_ind_beta = c(0, 2),
+ prior_pars_global_error_sigma = c(0, 2),
model = "constant_single_ind")
}
@@ -49,7 +50,9 @@ hmde_const_multi_ind <- function(){
obs_index = NULL,
time = NULL,
ind_id = NULL,
- y_0_obs = NULL,
+ prior_pars_pop_beta_mu = c(0,2),
+ prior_pars_pop_beta_sigma = c(0,2),
+ prior_pars_global_error_sigma = c(0,2),
model = "constant_multi_ind")
}
@@ -62,7 +65,10 @@ hmde_canham_single_ind <- function(){
y_obs = NULL,
obs_index = NULL,
time = NULL,
- y_0_obs = NULL,
+ prior_pars_ind_max_growth = c(0,2),
+ prior_pars_ind_size_at_max_growth = c(0,2),
+ prior_pars_ind_k = c(0,2),
+ prior_pars_global_error_sigma = c(0,2),
model = "canham_single_ind")
}
@@ -77,7 +83,13 @@ hmde_canham_multi_ind <- function(){
obs_index = NULL,
time = NULL,
ind_id = NULL,
- y_0_obs = NULL,
+ prior_pars_pop_max_growth_mean = c(0,2),
+ prior_pars_pop_max_growth_sd = c(0,2),
+ prior_pars_pop_size_at_max_growth_mean = c(0,2),
+ prior_pars_pop_size_at_max_growth_sd = c(0,2),
+ prior_pars_pop_k_mean = c(0,2),
+ prior_pars_pop_k_sd = c(0,2),
+ prior_pars_global_error_sigma = c(0,2),
model = "canham_multi_ind")
}
@@ -91,8 +103,10 @@ hmde_vb_single_ind <- function(){
y_obs = NULL,
obs_index = NULL,
time = NULL,
- y_0_obs = NULL,
y_bar = NULL,
+ prior_pars_ind_max_size_sd_only = 2,
+ prior_pars_ind_growth_rate = c(0,2),
+ prior_pars_global_error_sigma = c(0,2),
model = "vb_single_ind")
}
@@ -107,22 +121,28 @@ hmde_vb_multi_ind <- function(){
obs_index = NULL,
time = NULL,
ind_id = NULL,
- y_0_obs = NULL,
y_bar = NULL,
+ prior_pars_pop_max_size_mean_sd_only = 2,
+ prior_pars_pop_max_size_sd = c(0,2),
+ prior_pars_pop_growth_rate_mean = c(0,2),
+ prior_pars_pop_growth_rate_sd = c(0,2),
+ prior_pars_global_error_sigma = c(0,2),
model = "vb_multi_ind")
}
-#' Data configuration template for linear growth single individual model
+#' Data configuration template for affine growth single individual model
#' @keywords internal
#' @noRd
#'
-hmde_linear_single_ind <- function(){
+hmde_affine_single_ind <- function(){
list(step_size = NULL,
n_obs = NULL,
y_obs = NULL,
obs_index = NULL,
time = NULL,
- y_0_obs = NULL,
+ int_method = NULL,
y_bar = NULL,
- model = "linear_single_ind")
+ prior_pars_ind_const = c(1,2),
+ prior_pars_ind_beta_1 = c(0,2),
+ model = "affine_single_ind")
}
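
Since the templates now carry default prior parameters instead of `y_0_obs`, they can be inspected (and overridden through `hmde_assign_data()`) before fitting. A minimal sketch using the affine template defaults defined above:

```r
template <- hmde_model("affine_single_ind")
template$prior_pars_ind_const  # c(1, 2) by default
template$prior_pars_ind_beta_1 # c(0, 2) by default
names(template) # step_size, n_obs, ..., int_method, y_bar, prior_pars_*, model
```
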
diff --git a/R/hmde_plot_de_pieces.R b/R/hmde_plot_de_pieces.R
index af56427..ae9f4d2 100644
--- a/R/hmde_plot_de_pieces.R
+++ b/R/hmde_plot_de_pieces.R
@@ -27,8 +27,8 @@ hmde_plot_de_pieces <- function(model = NULL,
colour = "#006600",
alpha = 0.4){
#Check for model
- if(!model %in% hmde_model_name()){
- stop("Model name not recognised. Run hmde_model_name() to see available models.")
+ if(!model %in% hmde_model_names()){
+ stop("Model name not recognised. Run hmde_model_names() to see available models.")
}
if(is.null(model)){
@@ -89,9 +89,9 @@ hmde_ggplot_de_pieces <- function(pars_data,
plot <- ggplot() +
xlim(min(y_0), max(y_final)) +
labs(x = xlab, y = ylab, title = title) +
- theme_classic() +
- theme(axis.text=element_text(size=16),
- axis.title=element_text(size=18,face="bold"))
+ theme_classic()# +
+ #theme(axis.text=element_text(size=16),
+ # axis.title=element_text(size=18,face="bold"))
for(i in 1:nrow(pars_data)){
args_list <- list(pars=pars_data[i,-1]) #Remove ind_id
diff --git a/R/hmde_plot_obs_est_inds.R b/R/hmde_plot_obs_est_inds.R
new file mode 100644
index 0000000..b1f0f65
--- /dev/null
+++ b/R/hmde_plot_obs_est_inds.R
@@ -0,0 +1,78 @@
+#' Plot estimated and observed values over time for a chosen number of individuals based
+#' on posterior estimates. Structured to take in the measurement_data tibble constructed by
+#' the hmde_extract_estimates function.
+#'
+#' @param measurement_data tibble with estimated measurements
+#' @param ind_id_vec vector with list of ind_id values
+#' @param n_ind_to_plot integer giving number of individuals to plot at random if ind_id_vec is not specified
+#' @param xlab character string for replacement x axis label
+#' @param ylab character string for replacement y axis label
+#' @param title character string for replacement plot title
+#'
+#' @return ggplot object
+#' @export
+#' @import ggplot2
+#' @import dplyr
+
+hmde_plot_obs_est_inds <- function(ind_id_vec = NULL,
+ n_ind_to_plot = NULL,
+ measurement_data = NULL,
+ xlab = "Time",
+ ylab = "Y(t)",
+ title = NULL){
+ if(is.null(measurement_data)){
+ stop("Measurement data not provided.")
+ }
+
+ if(!is.null(ind_id_vec)){
+    for(i in ind_id_vec){ #Error if ID number not in data.
+      if(!i %in% measurement_data$ind_id){
+        stop(paste0("Ind. ID value not recognised: ", i))
+ }
+ }
+
+ plot_data <- measurement_data %>%
+ filter(ind_id %in% ind_id_vec)
+
+ } else if(is.null(n_ind_to_plot)){ #Error if no info for which inds to plot
+ stop("Neither ind. ID values nor a number of individuals provided.")
+
+ } else if(length(unique(measurement_data$ind_id)) < n_ind_to_plot){
+ stop("Number of individuals to plot larger than sample size, please provide a smaller number.")
+
+ } else {
+ sample_ids <- sample(unique(measurement_data$ind_id), size=n_ind_to_plot)
+ plot_data <- measurement_data %>%
+ filter(ind_id %in% sample_ids)
+ }
+
+ #Generate plot
+ plot <- hmde_ggplot_obs_est_inds(plot_data = plot_data,
+ xlab = xlab,
+ ylab = ylab,
+ title = title)
+
+ return(plot)
+}
+
+#' Produce plot for plot_obs_est_inds
+#' @keywords internal
+#' @noRd
+hmde_ggplot_obs_est_inds <- function(plot_data,
+ xlab,
+ ylab,
+ title){
+ plot <- ggplot(data=plot_data, aes(group = ind_id)) +
+ geom_point(aes(x = time, y=y_obs, colour = as.factor(ind_id)),
+ shape = 1) +
+ geom_line(aes(x = time, y=y_obs, colour = as.factor(ind_id)),
+ linetype = "dashed") +
+ geom_point(aes(x = time, y=y_hat, colour = as.factor(ind_id)),
+ shape = 2) +
+ geom_line(aes(x = time, y=y_hat, colour = as.factor(ind_id)),
+ linetype = "solid") +
+ labs(x=xlab, y=ylab, colour="Ind. ID") +
+ theme_classic()
+
+ return(plot)
+}
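
A hedged usage sketch for the new plot function, assuming `estimates` is a hypothetical output of `hmde_extract_estimates()` (whose `measurement_data` tibble this function is built around):

```r
# Plot five randomly sampled individuals: observed values are drawn with
# open circles and dashed lines, posterior y_hat with triangles and solid lines.
hmde_plot_obs_est_inds(measurement_data = estimates$measurement_data,
                       n_ind_to_plot = 5)

# Or pick specific individuals by ID.
hmde_plot_obs_est_inds(measurement_data = estimates$measurement_data,
                       ind_id_vec = c(1, 7))
```
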
diff --git a/R/hmde_run.R b/R/hmde_run.R
index aeae52e..11f5cb8 100644
--- a/R/hmde_run.R
+++ b/R/hmde_run.R
@@ -8,20 +8,19 @@
hmde_run <- function(model_template, ...) {
#Check for model
- if(!model_template$model %in% hmde_model_name()){
- stop("Model name not recognised. Run hmde_model_name() to see available models.")
+ if(!model_template$model %in% hmde_model_names()){
+ stop("Model name not recognised. Run hmde_model_names() to see available models.")
}
# Detect model
out <- switch(model_template$model,
- linear = rstan::sampling(stanmodels$linear, data = model_template, ...),
constant_single_ind = rstan::sampling(stanmodels$constant_single_ind, data = model_template, ...),
constant_multi_ind = rstan::sampling(stanmodels$constant_multi_ind, data = model_template, ...),
canham_single_ind = rstan::sampling(stanmodels$canham_single_ind, data = model_template, ...),
canham_multi_ind = rstan::sampling(stanmodels$canham_multi_ind, data = model_template, ...),
vb_single_ind = rstan::sampling(stanmodels$vb_single_ind, data = model_template, ...),
vb_multi_ind = rstan::sampling(stanmodels$vb_multi_ind, data = model_template, ...),
- linear_single_ind = rstan::sampling(stanmodels$linear_single_ind, data = model_template, ...))
+ affine_single_ind = rstan::sampling(stanmodels$affine_single_ind, data = model_template, ...))
return(out)
}
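
A sketch of a run against the new affine model, mirroring the prior-values test added below: `int_method` selects the integrator in the Stan file (1 = RK4, 2 = RK45, 3 = analytic solution), and `step_size` only matters for RK4.

```r
fit <- hmde_model("affine_single_ind") |>
  hmde_assign_data(n_obs = 5,
                   y_obs = 1:5,
                   obs_index = 1:5,
                   time = 1:5,
                   y_bar = 3,        # centring constant, mean(y_obs) here
                   step_size = 1,    # RK4 step size
                   int_method = 1) |> # 1 = RK4
  hmde_run(chains = 1, iter = 1000)
```
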
diff --git a/R/stanmodels.R b/R/stanmodels.R
index 71a80ae..ebbeff2 100644
--- a/R/stanmodels.R
+++ b/R/stanmodels.R
@@ -1,14 +1,14 @@
# Generated by rstantools. Do not edit by hand.
# names of stan models
-stanmodels <- c("canham_multi_ind", "canham_single_ind", "constant_multi_ind", "constant_single_ind", "linear_single_ind", "vb_multi_ind", "vb_single_ind")
+stanmodels <- c("affine_single_ind", "canham_multi_ind", "canham_single_ind", "constant_multi_ind", "constant_single_ind", "vb_multi_ind", "vb_single_ind")
# load each stan module
+Rcpp::loadModule("stan_fit4affine_single_ind_mod", what = TRUE)
Rcpp::loadModule("stan_fit4canham_multi_ind_mod", what = TRUE)
Rcpp::loadModule("stan_fit4canham_single_ind_mod", what = TRUE)
Rcpp::loadModule("stan_fit4constant_multi_ind_mod", what = TRUE)
Rcpp::loadModule("stan_fit4constant_single_ind_mod", what = TRUE)
-Rcpp::loadModule("stan_fit4linear_single_ind_mod", what = TRUE)
Rcpp::loadModule("stan_fit4vb_multi_ind_mod", what = TRUE)
Rcpp::loadModule("stan_fit4vb_single_ind_mod", what = TRUE)
diff --git a/README.Rmd b/README.Rmd
index 47cec31..70ecf08 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -4,10 +4,11 @@ output: github_document
# hmde
-The goal of `hmde` is to implement hierarchical Bayesian longitudinal models to solve the Bayesian inverse problem of estimating differential equation parameters based on repeat measurement surveys. Estimation is done using Markov Chain Monte Carlo, implemented through
-[Stan](https://mc-stan.org/) via [RStan](https://mc-stan.org/users/interfaces/rstan), built under [R](https://cran.r-project.org/) 4.3.3. The inbuilt models are based on case studies in ecology. The hierarchical Bayesian longitudinal method was first introduced in O'Brien et al., [2024](https://besjournals.onlinelibrary.wiley.com/doi/10.1111/2041-210X.14463).
+The goal of `hmde` is to fit a model for the rate of change in some quantity based on a set of pre-defined functions arising from ecological applications. We estimate differential equation parameters from repeated observations of a process, such as fitting growth rate parameters to measurements of sizes over time.
+In other words, `hmde` implements hierarchical Bayesian longitudinal models to solve the Bayesian inverse problem of estimating differential equation parameters based on repeat measurement surveys. Estimation is done using Markov Chain Monte Carlo, implemented through
+[Stan](https://mc-stan.org/) via [RStan](https://mc-stan.org/users/interfaces/rstan), built under [R](https://cran.r-project.org/) 4.3.3. The inbuilt models are based on case studies in ecology.
-As `hmde` is first intended for biologists, the initial set of vignettes (`hmde`, `constant-growth`, `von-bertalanffy`, and `canham`) are written aimed at an audience more interested in applications than the underlying theory. A vignette for the more mathematically interested is under development.
+A pre-print paper is available on [bioRxiv](https://doi.org/10.1101/2025.01.15.633280), or as the [hmde_paper.pdf](https://github.com/traitecoevo/hmde/blob/master/inst/article/hmde_paper.pdf) file in this repository.
## The Maths
@@ -44,8 +45,6 @@ model is a hump-shaped function given by
$$f \left( Y \left( t \right), f_{max}, Y_{max}, k \right) = \frac{dY}{dt} = f_{max} \exp \Bigg( -\frac{1}{2} \bigg( \frac{ \ln \left( Y \left( t \right) / Y_{max} \right) }{k} \bigg)^2 \Bigg), $$
where $f_{max}$ is the maximum growth rate, $Y_{max}$ is the $Y$-value at which that maximum occurs, and $k$ controls how narrow or wide the peak is.
-##
-
## Installation
‘hmde’ is under active development. You can install the current
diff --git a/README.md b/README.md
index 11e117d..746b193 100644
--- a/README.md
+++ b/README.md
@@ -1,22 +1,23 @@
# hmde
-The goal of `hmde` is to implement hierarchical Bayesian longitudinal
-models to solve the Bayesian inverse problem of estimating differential
-equation parameters based on repeat measurement surveys. Estimation is
-done using Markov Chain Monte Carlo, implemented through
-[Stan](https://mc-stan.org/) via
+The goal of `hmde` is to fit a model for the rate of change in some
+quantity based on a set of pre-defined functions arising from ecological
+applications. We estimate differential equation parameters from repeated
+observations of a process, such as fitting growth rate parameters to
+measurements of sizes over time. In other words, `hmde` implements hierarchical
+Bayesian longitudinal models to solve the Bayesian inverse problem of
+estimating differential equation parameters based on repeat measurement
+surveys. Estimation is done using Markov Chain Monte Carlo, implemented
+through [Stan](https://mc-stan.org/) via
[RStan](https://mc-stan.org/users/interfaces/rstan), built under
[R](https://cran.r-project.org/) 4.3.3. The inbuilt models are based on
-case studies in ecology. The hierarchical Bayesian longitudinal method
-was first introduced in O’Brien et al.,
-[2024](https://besjournals.onlinelibrary.wiley.com/doi/10.1111/2041-210X.14463).
+case studies in ecology.
-As `hmde` is first intended for biologists, the initial set of vignettes
-(`hmde`, `constant-growth`, `von-bertalanffy`, and `canham`) are written
-aimed at an audience more interested in applications than the underlying
-theory. A vignette for the more mathematically interested is under
-development.
+A pre-print paper is available on
+[bioRxiv](https://doi.org/10.1101/2025.01.15.633280), or as the
+[hmde_paper.pdf](https://github.com/traitecoevo/hmde/blob/master/inst/article/hmde_paper.pdf)
+file in this repository.
## The Maths
@@ -70,8 +71,6 @@ where $f_{max}$ is the maximum growth rate, $Y_{max}$ is the $Y$-value
at which that maximum occurs, and $k$ controls how narrow or wide the
peak is.
-##
-
## Installation
‘hmde’ is under active development. You can install the current
diff --git a/Read-and-delete-me b/Read-and-delete-me
deleted file mode 100644
index 7ae89b5..0000000
--- a/Read-and-delete-me
+++ /dev/null
@@ -1,10 +0,0 @@
-Stan-specific notes:
-
-* All '.stan' files containing stanmodel definitions must be placed in 'inst/stan'.
-* Additional files to be included by stanmodel definition files
- (via e.g., #include "mylib.stan") must be placed in any subfolder of 'inst/stan'.
-* Additional C++ files needed by any '.stan' file must be placed in 'inst/include',
- and can only interact with the Stan C++ library via '#include' directives
- placed in the file 'inst/include/stan_meta_header.hpp'.
-* The precompiled stanmodel objects will appear in a named list called 'stanmodels',
- and you can call them with e.g., 'rstan::sampling(stanmodels$foo, ...)'
diff --git a/data/Tree_Size_Ests.rda b/data/Tree_Size_Ests.rda
index a863e24..88d2fd4 100644
Binary files a/data/Tree_Size_Ests.rda and b/data/Tree_Size_Ests.rda differ
diff --git a/hmde.Rproj b/hmde.Rproj
index 6a3ede2..fd50596 100644
--- a/hmde.Rproj
+++ b/hmde.Rproj
@@ -1,4 +1,5 @@
Version: 1.0
+ProjectId: 276917f4-3da1-492e-bb45-7ffd205ea048
RestoreWorkspace: No
SaveWorkspace: No
diff --git a/inst/article/hmde_paper.pdf b/inst/article/hmde_paper.pdf
new file mode 100644
index 0000000..191ad41
Binary files /dev/null and b/inst/article/hmde_paper.pdf differ
diff --git a/inst/stan/affine_single_ind.stan b/inst/stan/affine_single_ind.stan
new file mode 100644
index 0000000..47376d7
--- /dev/null
+++ b/inst/stan/affine_single_ind.stan
@@ -0,0 +1,173 @@
+//Growth function
+functions{
+ //Growth function for use with Runge-Kutta method
+ //pars = (const, beta_1, y_bar)
+  real DE_rk4(real y, array[] real pars){
+    return pars[1] - (pars[2] * (y-pars[3])); //dY/dt = const - beta_1 * (y - y_bar)
+ }
+
+ real rk4_step(real y, array[] real pars, real interval){
+ real k1;
+ real k2;
+ real k3;
+ real k4;
+ real y_hat;
+
+ k1 = DE_rk4(y, pars);
+ k2 = DE_rk4(y+interval*k1/2.0, pars);
+ k3 = DE_rk4(y+interval*k2/2.0, pars);
+ k4 = DE_rk4(y+interval*k3, pars);
+
+ y_hat = y + (1.0/6.0) * (k1 + 2.0*k2 + 2.0*k3 + k4) * interval;
+
+ return y_hat;
+ }
+
+ real rk4(real y, array[] real pars, real interval, real step_size){
+ int steps;
+ real duration;
+ real y_hat;
+ real step_size_temp;
+
+ duration = 0;
+ y_hat = y;
+
+ while(duration < interval){
+ //Determine the relevant step size
+ step_size_temp = min([step_size, interval-duration]);
+
+ //Get next size estimate
+ y_hat = rk4_step(y_hat, pars, step_size_temp);
+
+ //Increment observed duration
+ duration = duration + step_size_temp;
+ }
+
+ return y_hat;
+ }
+
+ vector DE_RK45(real t, vector y, real ind_const, real beta_1, real y_bar){
+ vector[size(y)] dydt = ind_const - (beta_1 * (y-y_bar));
+ return dydt;
+ }
+
+ real analytic_solution(real t, real y_0, real ind_const, real beta_1, real y_bar){
+ real y_0_translate = y_0 - y_bar;
+ return ind_const/beta_1 + (y_0_translate - (ind_const/beta_1)) * exp(-beta_1 * t) + y_bar;
+ }
+}
+
+// Data structure
+data {
+ real step_size;
+ int n_obs;
+ real y_obs[n_obs];
+ int obs_index[n_obs];
+ real time[n_obs];
+ real y_bar;
+ int int_method;
+  real prior_pars_ind_const[2]; //Vector of pars for ind_const parameter priors
+  real prior_pars_ind_beta_1[2]; //Vector of pars for ind_beta_1 parameter priors
+}
+
+// The parameters accepted by the model.
+parameters {
+ //Individual level
+ real ind_y_0;
+ real ind_const;
+ real ind_beta_1;
+}
+
+// The model to be estimated.
+model {
+ real y_hat[n_obs];
+ array[3] real pars;
+ vector[1] y_temp;
+
+ pars[1] = ind_const;
+ pars[2] = ind_beta_1;
+ pars[3] = y_bar;
+
+ for(i in 1:n_obs){
+
+ if(obs_index[i]==1){//Fits the first size
+ y_hat[i] = ind_y_0;
+ }
+
+ if(i < n_obs){
+ //Estimate next size
+ if(int_method == 1){ //RK4
+ y_hat[i+1] = rk4(y_hat[i], pars, (time[i+1] - time[i]), step_size);
+ }
+
+ if(int_method == 2){ //RK45
+ y_temp[1] = y_hat[i];
+ y_hat[i+1] = ode_rk45(DE_RK45, y_temp,
+ time[i], {time[i+1]},
+ ind_const, ind_beta_1, y_bar)[1][1];
+ }
+
+ if(int_method == 3){ //Analytic solution
+ y_hat[i+1] = analytic_solution((time[i+1] - time[1]),
+ ind_y_0,
+ ind_const,
+ ind_beta_1,
+ y_bar);
+ }
+ }
+ }
+
+  //Likelihood: error sigma is fixed because the affine model has no global_error_sigma parameter
+  y_obs ~ normal(y_hat, 0.1);
+
+ //Priors
+ //Individual level
+ ind_const ~normal(prior_pars_ind_const[1], prior_pars_ind_const[2]);
+ ind_beta_1 ~lognormal(prior_pars_ind_beta_1[1], prior_pars_ind_beta_1[2]);
+}
+
+generated quantities{
+ real y_hat[n_obs];
+ array[3] real pars;
+ real ind_beta_0;
+ vector[1] y_temp;
+
+ //Return the used prior parameters
+ real check_prior_pars_ind_const[2] = prior_pars_ind_const;
+  real check_prior_pars_ind_beta_1[2] = prior_pars_ind_beta_1;
+
+ ind_beta_0 = ind_const + ind_beta_1*y_bar;
+
+ pars[1] = ind_const;
+ pars[2] = ind_beta_1;
+ pars[3] = y_bar;
+
+ for(i in 1:n_obs){
+
+ if(obs_index[i]==1){//Fits the first size
+ y_hat[i] = ind_y_0;
+ }
+
+ if(i < n_obs){
+ //Estimate next size
+ if(int_method == 1){ //RK4
+ y_hat[i+1] = rk4(y_hat[i], pars, (time[i+1] - time[i]), step_size);
+ }
+
+ if(int_method == 2){ //RK45
+ y_temp[1] = y_hat[i];
+ y_hat[i+1] = ode_rk45(DE_RK45, y_temp,
+ time[i], {time[i+1]},
+ ind_const, ind_beta_1, y_bar)[1][1];
+ }
+
+ if(int_method == 3){ //Analytic solution
+ y_hat[i+1] = analytic_solution((time[i+1] - time[1]),
+ ind_y_0,
+ ind_const,
+ ind_beta_1,
+ y_bar);
+ }
+ }
+ }
+}
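
A quick R cross-check of the integrators defined above; this is a sketch that re-implements `rk4_step` and `analytic_solution` outside Stan to confirm they agree for the affine DE.

```r
# R re-implementations of the Stan helpers, for checking the integrators agree.
de <- function(y, const, beta_1, y_bar) const - beta_1 * (y - y_bar)

rk4_step <- function(y, const, beta_1, y_bar, h) {
  k1 <- de(y, const, beta_1, y_bar)
  k2 <- de(y + h * k1 / 2, const, beta_1, y_bar)
  k3 <- de(y + h * k2 / 2, const, beta_1, y_bar)
  k4 <- de(y + h * k3, const, beta_1, y_bar)
  y + (k1 + 2 * k2 + 2 * k3 + k4) * h / 6
}

analytic <- function(t, y_0, const, beta_1, y_bar) {
  const / beta_1 + (y_0 - y_bar - const / beta_1) * exp(-beta_1 * t) + y_bar
}

# 100 RK4 steps of size 0.01 cover t in [0, 1]; difference should be ~0.
y_rk4 <- 1
for (i in 1:100) y_rk4 <- rk4_step(y_rk4, const = 2, beta_1 = 0.5, y_bar = 3, h = 0.01)
y_rk4 - analytic(1, y_0 = 1, const = 2, beta_1 = 0.5, y_bar = 3)
```
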
diff --git a/inst/stan/canham_multi_ind.stan b/inst/stan/canham_multi_ind.stan
index 60ca907..8823442 100644
--- a/inst/stan/canham_multi_ind.stan
+++ b/inst/stan/canham_multi_ind.stan
@@ -16,7 +16,13 @@ data {
int obs_index[n_obs];
real time[n_obs];
int ind_id[n_obs];
- real y_0_obs[n_ind];
+ real prior_pars_pop_max_growth_mean[2];
+ real prior_pars_pop_max_growth_sd[2];
+ real prior_pars_pop_size_at_max_growth_mean[2];
+ real prior_pars_pop_size_at_max_growth_sd[2];
+ real prior_pars_pop_k_mean[2];
+ real prior_pars_pop_k_sd[2];
+ real prior_pars_global_error_sigma[2];
}
// The parameters accepted by the model.
@@ -67,7 +73,6 @@ model {
//Priors
//Individual level
- ind_y_0 ~ normal(y_0_obs, global_error_sigma);
ind_max_growth ~lognormal(pop_max_growth_mean,
pop_max_growth_sd);
ind_size_at_max_growth ~lognormal(pop_size_at_max_growth_mean,
@@ -76,21 +81,35 @@ model {
pop_k_sd);
//Species level
- pop_max_growth_mean ~normal(0, 1);
- pop_max_growth_sd ~cauchy(0, 1);
- pop_size_at_max_growth_mean ~normal(0, 1);
- pop_size_at_max_growth_sd ~cauchy(0, 1);
- pop_k_mean ~normal(0, 1);
- pop_k_sd ~cauchy(0, 1);
+ pop_max_growth_mean ~normal(prior_pars_pop_max_growth_mean[1],
+ prior_pars_pop_max_growth_mean[2]);
+ pop_max_growth_sd ~cauchy(prior_pars_pop_max_growth_sd[1],
+ prior_pars_pop_max_growth_sd[2]);
+ pop_size_at_max_growth_mean ~normal(prior_pars_pop_size_at_max_growth_mean[1],
+ prior_pars_pop_size_at_max_growth_mean[2]);
+ pop_size_at_max_growth_sd ~cauchy(prior_pars_pop_size_at_max_growth_sd[1],
+ prior_pars_pop_size_at_max_growth_sd[2]);
+ pop_k_mean ~normal(prior_pars_pop_k_mean[1], prior_pars_pop_k_mean[2]);
+ pop_k_sd ~cauchy(prior_pars_pop_k_sd[1], prior_pars_pop_k_sd[2]);
//Global level
- global_error_sigma ~cauchy(0, 2);
+ global_error_sigma ~cauchy(prior_pars_global_error_sigma[1],
+ prior_pars_global_error_sigma[2]);
}
generated quantities{
real y_hat[n_obs];
vector[1] y_temp;
+ //Return the used prior parameters
+ real check_prior_pars_pop_max_growth_mean[2] = prior_pars_pop_max_growth_mean;
+ real check_prior_pars_pop_max_growth_sd[2] = prior_pars_pop_max_growth_sd;
+ real check_prior_pars_pop_size_at_max_growth_mean[2] = prior_pars_pop_size_at_max_growth_mean;
+ real check_prior_pars_pop_size_at_max_growth_sd[2] = prior_pars_pop_size_at_max_growth_sd;
+ real check_prior_pars_pop_k_mean[2] = prior_pars_pop_k_mean;
+ real check_prior_pars_pop_k_sd[2] = prior_pars_pop_k_sd;
+ real check_prior_pars_global_error_sigma[2] = prior_pars_global_error_sigma;
+
for(i in 1:n_obs){
if(obs_index[i]==1){//Fits the first size
diff --git a/inst/stan/canham_single_ind.stan b/inst/stan/canham_single_ind.stan
index 8babfc8..05a8e86 100644
--- a/inst/stan/canham_single_ind.stan
+++ b/inst/stan/canham_single_ind.stan
@@ -14,7 +14,10 @@ data {
real y_obs[n_obs];
int obs_index[n_obs];
real time[n_obs];
- real y_0_obs;
+ real prior_pars_ind_max_growth[2];
+ real prior_pars_ind_size_at_max_growth[2];
+ real prior_pars_ind_k[2];
+ real prior_pars_global_error_sigma[2];
}
// The parameters accepted by the model.
@@ -54,19 +57,27 @@ model {
//Priors
//Individual level
- ind_y_0 ~ normal(y_0_obs, global_error_sigma);
- ind_max_growth ~lognormal(0, 1);
- ind_size_at_max_growth ~lognormal(3, 1);
- ind_k ~lognormal(0, 1);
+ ind_max_growth ~lognormal(prior_pars_ind_max_growth[1],
+ prior_pars_ind_max_growth[2]);
+ ind_size_at_max_growth ~lognormal(prior_pars_ind_size_at_max_growth[1],
+ prior_pars_ind_size_at_max_growth[2]);
+ ind_k ~lognormal(prior_pars_ind_k[1], prior_pars_ind_k[2]);
//Global level
- global_error_sigma ~cauchy(0,5);
+ global_error_sigma ~cauchy(prior_pars_global_error_sigma[1],
+ prior_pars_global_error_sigma[2]);
}
generated quantities{
real y_hat[n_obs];
vector[1] y_temp;
+ //Return the used prior parameters
+ real check_prior_pars_ind_max_growth[2] = prior_pars_ind_max_growth;
+ real check_prior_pars_ind_size_at_max_growth[2] = prior_pars_ind_size_at_max_growth;
+ real check_prior_pars_ind_k[2] = prior_pars_ind_k;
+ real check_prior_pars_global_error_sigma[2] = prior_pars_global_error_sigma;
+
for(i in 1:n_obs){
if(obs_index[i]==1){//Fits the first size
@@ -79,8 +90,9 @@ generated quantities{
//Estimate next size
y_hat[i+1] = ode_rk45(DE, y_temp,
time[i], {time[i+1]},
- ind_max_growth, ind_size_at_max_growth, ind_k)[1][1];
-
+ ind_max_growth,
+ ind_size_at_max_growth,
+ ind_k)[1][1];
}
}
}
diff --git a/inst/stan/constant_multi_ind.stan b/inst/stan/constant_multi_ind.stan
index 16d0809..a1bb379 100644
--- a/inst/stan/constant_multi_ind.stan
+++ b/inst/stan/constant_multi_ind.stan
@@ -18,7 +18,9 @@ data {
int obs_index[n_obs];
real time[n_obs];
int ind_id[n_obs];
- real y_0_obs[n_ind];
+ real prior_pars_pop_beta_mu[2];
+ real prior_pars_pop_beta_sigma[2];
+ real prior_pars_global_error_sigma[2];
}
// The parameters accepted by the model.
@@ -57,22 +59,32 @@ model {
//Priors
//Individual level
- ind_y_0 ~ normal(y_0_obs, global_error_sigma);
ind_beta ~ lognormal(pop_beta_mu,
pop_beta_sigma);
//Population level
- pop_beta_mu ~ normal(0.1, 1);
- pop_beta_sigma ~cauchy(0.1, 1);
+ pop_beta_mu ~ normal(prior_pars_pop_beta_mu[1],
+ prior_pars_pop_beta_mu[2]);
+ pop_beta_sigma ~cauchy(prior_pars_pop_beta_sigma[1],
+ prior_pars_pop_beta_sigma[2]);
//Global level
- global_error_sigma ~cauchy(0.1, 1);
+ global_error_sigma ~cauchy(prior_pars_global_error_sigma[1],
+ prior_pars_global_error_sigma[2]);
}
// The output
generated quantities {
real y_hat[n_obs];
- real Delta_hat[n_obs];
+
+ //Return the used prior parameters
+ real check_prior_pars_pop_beta_mu[2];
+ real check_prior_pars_pop_beta_sigma[2];
+ real check_prior_pars_global_error_sigma[2];
+
+ check_prior_pars_pop_beta_mu = prior_pars_pop_beta_mu;
+ check_prior_pars_pop_beta_sigma = prior_pars_pop_beta_sigma;
+ check_prior_pars_global_error_sigma = prior_pars_global_error_sigma;
for(i in 1:n_obs){
@@ -85,12 +97,7 @@ generated quantities {
if(i < n_obs){
if(ind_id[i+1]==ind_id[i]){
y_hat[i+1] = size_step(y_hat[i], ind_beta[ind_id[i]], (time[i+1]-time[i]));
- Delta_hat[i] = y_hat[i+1] - y_hat[i];
- } else {
- Delta_hat[i] = DE(ind_beta[ind_id[i]]) * (time[i]-time[i-1]);
}
- } else {
- Delta_hat[i] = DE(ind_beta[ind_id[i]]) * (time[i]-time[i-1]);
}
}
}
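
The `check_prior_pars_*` generated quantities echo the prior parameters actually used, so a fitted object can be audited after the fact. A minimal sketch, assuming `fit` is a hypothetical stanfit from `hmde_run()`:

```r
# Every posterior draw of a check_* quantity repeats the prior values used.
rstan::extract(fit, pars = "check_prior_pars_pop_beta_mu")
```
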
diff --git a/inst/stan/constant_single_ind.stan b/inst/stan/constant_single_ind.stan
index cd7570a..55aff46 100644
--- a/inst/stan/constant_single_ind.stan
+++ b/inst/stan/constant_single_ind.stan
@@ -16,7 +16,8 @@ data {
real y_obs[n_obs];
int obs_index[n_obs];
real time[n_obs];
- real y_0_obs;
+ real prior_pars_ind_beta[2];
+ real prior_pars_global_error_sigma[2];
}
// The parameters accepted by the model.
@@ -50,17 +51,21 @@ model {
//Priors
//Individual level
- ind_y_0 ~ normal(y_0_obs, global_error_sigma);
- ind_beta ~ lognormal(0.1, 1);
+ ind_beta ~ lognormal(prior_pars_ind_beta[1],
+ prior_pars_ind_beta[2]);
//Global level
- global_error_sigma ~cauchy(0.1, 1);
+ global_error_sigma ~cauchy(prior_pars_global_error_sigma[1],
+ prior_pars_global_error_sigma[2]);
}
// The output
generated quantities {
real y_hat[n_obs];
- real Delta_hat[n_obs];
+
+ //Return the used prior parameters
+ real check_prior_pars_ind_beta[2] = prior_pars_ind_beta;
+ real check_prior_pars_global_error_sigma[2] = prior_pars_global_error_sigma;
for(i in 1:n_obs){
//Fits the first size
@@ -71,9 +76,6 @@ generated quantities {
// Estimate next size
if(i < n_obs){
y_hat[i+1] = size_step(y_hat[i], ind_beta, (time[i+1]-time[i]));
- Delta_hat[i] = y_hat[i+1] - y_hat[i];
- } else {
- Delta_hat[i] = DE(ind_beta) * (time[i]-time[i-1]);
}
}
}
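
Since `Delta_hat` is dropped from the generated quantities here (and in the multi-individual and vb models), a hedged sketch of recovering fitted increments after the fact, assuming `estimates` is a hypothetical output of `hmde_extract_estimates()` with the ind_id/time/y_hat columns used elsewhere in the package:

```r
library(dplyr)

# Fitted increments between successive observations of the same individual,
# computed from the posterior y_hat estimates instead of inside Stan.
delta_hat <- estimates$measurement_data |>
  group_by(ind_id) |>
  arrange(time, .by_group = TRUE) |>
  mutate(delta_hat = lead(y_hat) - y_hat) |>
  ungroup()
```
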
diff --git a/inst/stan/linear_single_ind.stan b/inst/stan/linear_single_ind.stan
deleted file mode 100644
index a5e53c3..0000000
--- a/inst/stan/linear_single_ind.stan
+++ /dev/null
@@ -1,133 +0,0 @@
-//Growth function
-functions{
- //Growth function for use with Runge-Kutta method
- //pars = (growth_par, max_size)
- real DE(real y, array[] real pars){ //change number of pars
- return pars[1] - (pars[2] * (y-pars[3])); //growth function
- }
-
- real rk4_step(real y, array[] real pars, real interval){
- real k1;
- real k2;
- real k3;
- real k4;
- real y_hat;
-
- k1 = DE(y, pars);
- k2 = DE(y+interval*k1/2.0, pars);
- k3 = DE(y+interval*k2/2.0, pars);
- k4 = DE(y+interval*k3, pars);
-
- y_hat = y + (1.0/6.0) * (k1 + 2.0*k2 + 2.0*k3 + k4) * interval;
-
- return y_hat;
- }
-
- real rk4(real y, array[] real pars, real interval, real step_size){
- int steps;
- real duration;
- real y_hat;
- real step_size_temp;
-
- duration = 0;
- y_hat = y;
-
- while(duration < interval){
- //Determine the relevant step size
- step_size_temp = min([step_size, interval-duration]);
-
- //Get next size estimate
- y_hat = rk4_step(y_hat, pars, step_size_temp);
-
- //Increment observed duration
- duration = duration + step_size_temp;
- }
-
- return y_hat;
- }
-}
-
-// Data structure
-data {
- real step_size;
- int n_obs;
- real y_obs[n_obs];
- int obs_index[n_obs];
- real time[n_obs];
- real y_bar;
- real y_0_obs;
-}
-
-// The parameters accepted by the model.
-parameters {
- //Individual level
- real ind_y_0;
- real ind_const;
- real ind_beta_1;
-
- //Global level
- real global_error_sigma;
-}
-
-// The model to be estimated.
-model {
- real y_hat[n_obs];
- array[3] real pars;
-
- pars[1] = ind_const;
- pars[2] = ind_beta_1;
- pars[3] = y_bar;
-
- for(i in 1:n_obs){
-
- if(obs_index[i]==1){//Fits the first size
- y_hat[i] = ind_y_0;
- }
-
- if(i < n_obs){
- //Estimate growth rate
- y_hat[i+1] = rk4(y_hat[i], pars, (time[i+1] - time[i]), step_size);
- }
- }
-
- //Likelihood
- y_obs ~ normal(y_hat, global_error_sigma);
-
- //Priors
- //Individual level
- ind_y_0 ~ normal(y_0_obs, global_error_sigma);
- ind_const ~lognormal(0, 5);
- ind_beta_1 ~lognormal(0, 5);
-
- //Global level
- global_error_sigma ~cauchy(0,1);
-}
-
-generated quantities{
- real y_hat[n_obs];
- real Delta_hat[n_obs];
- array[3] real pars;
- real ind_beta_0;
-
- ind_beta_0 = ind_const + ind_beta_1*y_bar;
-
- pars[1] = ind_const;
- pars[2] = ind_beta_1;
- pars[3] = y_bar;
-
- for(i in 1:n_obs){
-
- if(obs_index[i]==1){//Fits the first size
- y_hat[i] = ind_y_0;
- }
-
- if(i < n_obs){
- //Estimate next size
- y_hat[i+1] = rk4(y_hat[i], pars, (time[i+1] - time[i]), step_size);
- Delta_hat[i] = y_hat[i+1] - y_hat[i];
-
- } else {
- Delta_hat[i] = DE(y_hat[i], pars)*(time[i] - time[i-1]);
- }
- }
-}
diff --git a/inst/stan/vb_multi_ind.stan b/inst/stan/vb_multi_ind.stan
index e2c9ac6..27e3ab9 100644
--- a/inst/stan/vb_multi_ind.stan
+++ b/inst/stan/vb_multi_ind.stan
@@ -15,8 +15,12 @@ data {
int obs_index[n_obs];
real time[n_obs];
int ind_id[n_obs];
- real y_0_obs[n_ind];
real y_bar;
+ real prior_pars_pop_max_size_mean_sd_only;
+ real prior_pars_pop_max_size_sd[2];
+ real prior_pars_pop_growth_rate_mean[2];
+ real prior_pars_pop_growth_rate_sd[2];
+ real prior_pars_global_error_sigma[2];
}
// The parameters accepted by the model.
@@ -64,25 +68,35 @@ model {
//Priors
//Individual level
- ind_y_0 ~normal(y_0_obs, global_error_sigma);
ind_growth_rate ~lognormal(pop_growth_rate_mean, pop_growth_rate_sd);
ind_max_size ~lognormal(pop_max_size_mean, pop_max_size_sd);
//Population level
- pop_growth_rate_mean ~normal(0, 2);
- pop_growth_rate_sd ~cauchy(0, 2);
- pop_max_size_mean ~normal(max(log(y_obs)), 2);
- pop_max_size_sd ~cauchy(0, 2);
+ pop_max_size_mean ~normal(log(max(y_obs)),
+ prior_pars_pop_max_size_mean_sd_only);
+ pop_max_size_sd ~cauchy(prior_pars_pop_max_size_sd[1],
+ prior_pars_pop_max_size_sd[2]);
+ pop_growth_rate_mean ~normal(prior_pars_pop_growth_rate_mean[1],
+ prior_pars_pop_growth_rate_mean[2]);
+ pop_growth_rate_sd ~cauchy(prior_pars_pop_growth_rate_sd[1],
+ prior_pars_pop_growth_rate_sd[2]);
//Global level
- global_error_sigma ~cauchy(0, 5);
+ global_error_sigma ~cauchy(prior_pars_global_error_sigma[1],
+ prior_pars_global_error_sigma[2]);
}
generated quantities{
real y_hat[n_obs];
- real Delta_hat[n_obs];
array[3] real pars;
- real temp_y_final;
+
+ //Return the used prior parameters
+ real check_prior_pars_pop_max_size_mean_sd_only = prior_pars_pop_max_size_mean_sd_only;
+ real check_prior_pars_pop_max_size_mean_mean_max_obs = log(max(y_obs));
+ real check_prior_pars_pop_max_size_sd[2] = prior_pars_pop_max_size_sd;
+ real check_prior_pars_pop_growth_rate_mean[2] = prior_pars_pop_growth_rate_mean;
+ real check_prior_pars_pop_growth_rate_sd[2] = prior_pars_pop_growth_rate_sd;
+ real check_prior_pars_global_error_sigma[2] = prior_pars_global_error_sigma;
for(i in 1:n_obs){
// Initialise the parameters for the observation
@@ -98,14 +112,7 @@ generated quantities{
if(ind_id[i+1]==ind_id[i]){
//Estimate next size
y_hat[i+1] = solution(time[i+1], pars) + y_bar;
- Delta_hat[i] = y_hat[i+1] - y_hat[i];
- } else {// Estimate next growth based on same time to last.
- temp_y_final = solution(2*time[i] - time[i-1], pars) + y_bar;
- Delta_hat[i] = temp_y_final - y_hat[i];
}
- } else {
- temp_y_final = solution(2*time[i] - time[i-1], pars) + y_bar;
- Delta_hat[i] = temp_y_final - y_hat[i];
}
}
}
diff --git a/inst/stan/vb_single_ind.stan b/inst/stan/vb_single_ind.stan
index e37e054..7f7a1b5 100644
--- a/inst/stan/vb_single_ind.stan
+++ b/inst/stan/vb_single_ind.stan
@@ -13,8 +13,10 @@ data {
real y_obs[n_obs];
int obs_index[n_obs];
real time[n_obs];
- real y_0_obs;
real y_bar;
+ real prior_pars_ind_max_size_sd_only;
+ real prior_pars_ind_growth_rate[2];
+ real prior_pars_global_error_sigma[2];
}
// The parameters accepted by the model.
@@ -54,25 +56,30 @@ model {
//Priors
//Individual level
- ind_y_0 ~ normal(y_0_obs, global_error_sigma);
- ind_max_size ~lognormal(0, 1);
- ind_growth_rate ~lognormal(0, 1); //Take max obs. size as average value
+ ind_max_size ~lognormal(log(max(y_obs)),
+ prior_pars_ind_max_size_sd_only);
+ ind_growth_rate ~lognormal(prior_pars_ind_growth_rate[1],
+ prior_pars_ind_growth_rate[2]);
//Global level
- global_error_sigma ~cauchy(0,1);
+ global_error_sigma ~cauchy(prior_pars_global_error_sigma[1],
+ prior_pars_global_error_sigma[2]);
}
generated quantities{
real y_hat[n_obs];
- real Delta_hat[n_obs];
array[3] real pars;
+ //Return the used prior parameters
+ real check_prior_pars_ind_max_size_sd_only = prior_pars_ind_max_size_sd_only;
+ real check_prior_pars_ind_max_size_mean_max_obs = log(max(y_obs));
+ real check_prior_pars_ind_growth_rate[2] = prior_pars_ind_growth_rate;
+ real check_prior_pars_global_error_sigma[2] = prior_pars_global_error_sigma;
+
pars[1] = ind_max_size - y_bar;
pars[2] = ind_growth_rate;
pars[3] = ind_y_0 - y_bar;
- real temp_y_final;
-
for(i in 1:n_obs){
if(obs_index[i]==1){//Fits the first size
@@ -82,11 +89,6 @@ generated quantities{
if(i < n_obs){
//Estimate next size
y_hat[i+1] = solution(time[i+1], pars) + y_bar;
- Delta_hat[i] = y_hat[i+1] - y_hat[i];
-
- } else { // Estimate next growth based on same time to last.
- temp_y_final = solution(2*time[i] - time[i-1], pars) + y_bar;
- Delta_hat[i] = temp_y_final - y_hat[i];
}
}
}
diff --git a/man/hmde_linear_de.Rd b/man/hmde_affine_de.Rd
similarity index 55%
rename from man/hmde_linear_de.Rd
rename to man/hmde_affine_de.Rd
index 560d84a..9cf0e51 100644
--- a/man/hmde_linear_de.Rd
+++ b/man/hmde_affine_de.Rd
@@ -1,10 +1,10 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/hmde_model_des.R
-\name{hmde_linear_de}
-\alias{hmde_linear_de}
-\title{Differential equation for linear growth single individual model}
+\name{hmde_affine_de}
+\alias{hmde_affine_de}
+\title{Differential equation for affine growth single individual model}
\usage{
-hmde_linear_de(y = NULL, pars = NULL)
+hmde_affine_de(y = NULL, pars = NULL)
}
\arguments{
\item{y}{input real}
@@ -15,5 +15,5 @@ hmde_linear_de(y = NULL, pars = NULL)
value of differential equation at y
}
\description{
-Differential equation for linear growth single individual model
+Differential equation for affine growth single individual model
}
diff --git a/man/hmde_assign_data.Rd b/man/hmde_assign_data.Rd
index c72e6de..39b86b0 100644
--- a/man/hmde_assign_data.Rd
+++ b/man/hmde_assign_data.Rd
@@ -4,15 +4,13 @@
\alias{hmde_assign_data}
\title{Assign data to template for chosen model}
\usage{
-hmde_assign_data(model_template, data = NULL, step_size = NULL, ...)
+hmde_assign_data(model_template, data = NULL, ...)
}
\arguments{
\item{model_template}{output from hmde_model}
\item{data}{Input data tibble with columns including time, y_obs, obs_index, and additionally ind_id for multi-individual models}
-\item{step_size}{Step size for numerical integration.}
-
\item{...}{data-masking name-value pairs allowing specific input of elements}
}
\value{
diff --git a/man/hmde_extract_estimates.Rd b/man/hmde_extract_estimates.Rd
index 93b89b4..be77cc4 100644
--- a/man/hmde_extract_estimates.Rd
+++ b/man/hmde_extract_estimates.Rd
@@ -4,11 +4,9 @@
\alias{hmde_extract_estimates}
\title{Extract samples and return measurement, individual, and population-level estimates}
\usage{
-hmde_extract_estimates(model = NULL, fit = NULL, input_measurement_data = NULL)
+hmde_extract_estimates(fit = NULL, input_measurement_data = NULL)
}
\arguments{
-\item{model}{model name character string}
-
\item{fit}{fitted model Stan fit}
\item{input_measurement_data}{data used to fit the model with ind_id, y_obs, time, obs_index tibble}
diff --git a/man/hmde_model_name.Rd b/man/hmde_model_names.Rd
similarity index 79%
rename from man/hmde_model_name.Rd
rename to man/hmde_model_names.Rd
index f8ecbb1..27b66f7 100644
--- a/man/hmde_model_name.Rd
+++ b/man/hmde_model_names.Rd
@@ -1,10 +1,10 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/hmde_model_names.R
-\name{hmde_model_name}
-\alias{hmde_model_name}
+\name{hmde_model_names}
+\alias{hmde_model_names}
\title{Returns names of available models.}
\usage{
-hmde_model_name()
+hmde_model_names()
}
\value{
vector of character strings for model names.
diff --git a/man/hmde_plot_obs_est_inds.Rd b/man/hmde_plot_obs_est_inds.Rd
new file mode 100644
index 0000000..0940246
--- /dev/null
+++ b/man/hmde_plot_obs_est_inds.Rd
@@ -0,0 +1,38 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/hmde_plot_obs_est_inds.R
+\name{hmde_plot_obs_est_inds}
+\alias{hmde_plot_obs_est_inds}
+\title{Plot estimated and observed values over time for a chosen number of individuals based
+on posterior estimates. Structured to take in the measurement_data tibble constructed by
+the hmde_extract_estimates function.}
+\usage{
+hmde_plot_obs_est_inds(
+ ind_id_vec = NULL,
+ n_ind_to_plot = NULL,
+ measurement_data = NULL,
+ xlab = "Time",
+ ylab = "Y(t)",
+ title = NULL
+)
+}
+\arguments{
+\item{ind_id_vec}{vector with list of ind_id values}
+
+\item{n_ind_to_plot}{integer giving number of individuals to plot at random if ind_id_vec is not specified}
+
+\item{measurement_data}{tibble with estimated measurements}
+
+\item{xlab}{character string for replacement x axis label}
+
+\item{ylab}{character string for replacement y axis label}
+
+\item{title}{character string for replacement plot title}
+}
+\value{
+ggplot object
+}
+\description{
+Plot estimated and observed values over time for a chosen number of individuals based
+on posterior estimates. Structured to take in the measurement_data tibble constructed by
+the hmde_extract_estimates function.
+}
diff --git a/tests/testthat/fixtures/canham/canham_data_multi_ind.rds b/tests/testthat/fixtures/canham/canham_data_multi_ind.rds
index 52bd998..29c02d8 100644
Binary files a/tests/testthat/fixtures/canham/canham_data_multi_ind.rds and b/tests/testthat/fixtures/canham/canham_data_multi_ind.rds differ
diff --git a/tests/testthat/fixtures/canham/canham_data_single_ind.rds b/tests/testthat/fixtures/canham/canham_data_single_ind.rds
index b69df34..8b5e1f5 100644
Binary files a/tests/testthat/fixtures/canham/canham_data_single_ind.rds and b/tests/testthat/fixtures/canham/canham_data_single_ind.rds differ
diff --git a/tests/testthat/fixtures/canham/make_canham.R b/tests/testthat/fixtures/canham/make_canham.R
index 015fb0f..f97bdf3 100644
--- a/tests/testthat/fixtures/canham/make_canham.R
+++ b/tests/testthat/fixtures/canham/make_canham.R
@@ -30,8 +30,8 @@ DE_par_generation <- function(n_ind,
model_name <- "canham"
set.seed(2024) #Guarantees same data each time.
n_ind <- 3 #Number of individuals for multi-individual data. Single individual takes the first.
-n_obs_per_ind <- 10 #How many observations per individual.
-interval <- 5 #Time interval between observations
+n_obs_per_ind <- 20 #How many observations per individual.
+interval <- 1 #Time interval between observations
time = seq(from = 0, by = interval, length.out = n_obs_per_ind)
#Produce parameters
diff --git a/tests/testthat/helper-data_generation.R b/tests/testthat/helper-data_generation.R
index 570ae6e..dddfaee 100644
--- a/tests/testthat/helper-data_generation.R
+++ b/tests/testthat/helper-data_generation.R
@@ -60,14 +60,13 @@ hmde_export_test_data <- function(n_obs_per_ind,
true_data,
model_name,
step_size = 1){
- if(grepl("linear", model_name)){ #Step size for linear model
+ if(grepl("affine", model_name)){ #Step size for affine model
single_ind_data <- list(
step_size = step_size, #Number for model RK4 alg
n_obs = n_obs_per_ind, #Number
y_obs = y_obs[1:n_obs_per_ind], #Vector indexed by n_obs
obs_index = 1:n_obs_per_ind, #Vector indexed by n_obs
time = time, #Vector indexed by n_obs
- y_0_obs = y_obs[1], #Number
n_pars = ncol(DE_pars), #Number
single_true_data = list(
DE_pars = DE_pars[1,],
@@ -85,7 +84,6 @@ hmde_export_test_data <- function(n_obs_per_ind,
obs_index = rep(1:n_obs_per_ind, times = n_ind), #Vector indexed by n_obs
time = rep(time, times=n_ind), #Vector indexed by n_obs
ind_id = sort(rep(1:n_ind, times = n_obs_per_ind)), #Vector indexed by n_obs
- y_0_obs = y_obs[seq(from = 1, to=n_ind*n_obs_per_ind, by=n_obs_per_ind)], #Vector indexed by n_ind
n_pars = ncol(DE_pars),
multi_true_data = list(
DE_pars = DE_pars,
@@ -102,7 +100,6 @@ hmde_export_test_data <- function(n_obs_per_ind,
y_obs = y_obs[1:n_obs_per_ind], #Vector indexed by n_obs
obs_index = 1:n_obs_per_ind, #Vector indexed by n_obs
time = time, #Vector indexed by n_obs
- y_0_obs = y_obs[1], #Number
y_bar = y_bar, #Real
n_pars = ncol(DE_pars), #Number
single_true_data = list(
@@ -120,7 +117,6 @@ hmde_export_test_data <- function(n_obs_per_ind,
obs_index = rep(1:n_obs_per_ind, times = n_ind), #Vector indexed by n_obs
time = rep(time, times=n_ind), #Vector indexed by n_obs
ind_id = sort(rep(1:n_ind, times = n_obs_per_ind)), #Vector indexed by n_obs
- y_0_obs = y_obs[seq(from = 1, to=n_ind*n_obs_per_ind, by=n_obs_per_ind)], #Vector indexed by n_ind
y_bar = y_bar, #Real
n_pars = ncol(DE_pars),
multi_true_data = list(
@@ -137,7 +133,6 @@ hmde_export_test_data <- function(n_obs_per_ind,
y_obs = y_obs[1:n_obs_per_ind], #Vector indexed by n_obs
obs_index = 1:n_obs_per_ind, #Vector indexed by n_obs
time = time, #Vector indexed by n_obs
- y_0_obs = y_obs[1], #Number
n_pars = ncol(DE_pars), #Number
single_true_data = list(
DE_pars = DE_pars[1,],
@@ -154,7 +149,6 @@ hmde_export_test_data <- function(n_obs_per_ind,
obs_index = rep(1:n_obs_per_ind, times = n_ind), #Vector indexed by n_obs
time = rep(time, times=n_ind), #Vector indexed by n_obs
ind_id = sort(rep(1:n_ind, times = n_obs_per_ind)), #Vector indexed by n_obs
- y_0_obs = y_obs[seq(from = 1, to=n_ind*n_obs_per_ind, by=n_obs_per_ind)], #Vector indexed by n_ind
n_pars = ncol(DE_pars),
multi_true_data = list(
DE_pars = DE_pars,
diff --git a/tests/testthat/helper-testing.R b/tests/testthat/helper-testing.R
index 5681c0d..2aa9078 100644
--- a/tests/testthat/helper-testing.R
+++ b/tests/testthat/helper-testing.R
@@ -24,8 +24,7 @@ hmde_test_single_individual <- function(model_name,
hmde_assign_data(n_obs = data$n_obs, #integer
y_obs = data$y_obs,
obs_index = data$obs_index, #vector length N_obs
- time = data$time, #Vector length N_obs
- y_0_obs = data$y_0_obs #vector length N_ind
+ time = data$time #Vector length N_obs
) |>
hmde_run(chains = 1, iter = 1000, verbose = FALSE, show_messages = FALSE)
)
@@ -37,8 +36,7 @@ hmde_test_single_individual <- function(model_name,
y_obs = data$y_obs,
obs_index = data$obs_index, #vector length N_obs
time = data$time, #Vector length N_obs
- y_bar = data$y_bar, #Real
- y_0_obs = data$y_0_obs #vector length N_ind
+ y_bar = data$y_bar #Real
) |>
hmde_run(chains = 1, iter = 1000, verbose = FALSE, show_messages = FALSE)
)
@@ -63,6 +61,8 @@ hmde_test_single_individual <- function(model_name,
}
hmde_test_multi_individual <- function(model_name, data, est_dim){
+ iter <- 20
+
if(! is.null(data$step_size)){
if(is.null(data$y_bar)){ #Models that do not require centering
# Test multi-individual
@@ -74,10 +74,9 @@ hmde_test_multi_individual <- function(model_name, data, est_dim){
y_obs = data$y_obs, #vector length N_obs
obs_index = data$obs_index, #vector length N_obs
time = data$time, #Vector length N_obs
- ind_id = data$ind_id, #Vector length N_obs
- y_0_obs = data$y_0_obs #vector length N_ind
+ ind_id = data$ind_id #Vector length N_obs
) |>
- hmde_run(chains = 2, iter = 100, verbose = FALSE, show_messages = FALSE)
+ hmde_run(chains = 2, iter = iter, verbose = FALSE, show_messages = FALSE)
)
} else { #Models that do require centering with y_bar
# Test multi-individual
@@ -90,10 +89,9 @@ hmde_test_multi_individual <- function(model_name, data, est_dim){
obs_index = data$obs_index, #vector length N_obs
time = data$time, #Vector length N_obs
y_bar = data$y_bar, #Real
- ind_id = data$ind_id, #Vector length N_obs
- y_0_obs = data$y_0_obs #vector length N_ind
+ ind_id = data$ind_id #Vector length N_obs
) |>
- hmde_run(chains = 2, iter = 100, verbose = FALSE, show_messages = FALSE)
+ hmde_run(chains = 2, iter = iter, verbose = FALSE, show_messages = FALSE)
)
}
@@ -106,16 +104,15 @@ hmde_test_multi_individual <- function(model_name, data, est_dim){
y_obs = data$y_obs, #vector length N_obs
obs_index = data$obs_index, #vector length N_obs
time = data$time, #Vector length N_obs
- ind_id = data$ind_id, #Vector length N_obs
- y_0_obs = data$y_0_obs #vector length N_ind
+ ind_id = data$ind_id #Vector length N_obs
) |>
- hmde_run(chains = 2, iter = 100, verbose = FALSE, show_messages = FALSE)
+ hmde_run(chains = 2, iter = iter, verbose = FALSE, show_messages = FALSE)
)
}
# Extract samples
multi_samples <- rstan::extract(multi_ind_test, permuted = FALSE, inc_warmup = TRUE)
- expect_equal(dim(multi_samples), c(100, 2, est_dim))
+ expect_equal(dim(multi_samples), c(iter, 2, est_dim))
expect_visible(multi_ind_test)
expect_s4_class(multi_ind_test, "stanfit")
diff --git a/tests/testthat/test-hmde_assign_data.R b/tests/testthat/test-hmde_assign_data.R
index 93cd1b4..3c3fbb4 100644
--- a/tests/testthat/test-hmde_assign_data.R
+++ b/tests/testthat/test-hmde_assign_data.R
@@ -3,8 +3,7 @@ test_that("Execution and output: Constant single ind", {
hmde_assign_data(n_obs = 2,
y_obs = c(1,2),
obs_index = c(1,2),
- time = c(0,1),
- y_0_obs = 1)
+ time = c(0,1))
expect_named(constant_single)
@@ -27,8 +26,7 @@ test_that("Execution and output: Constant multi ind manual input", {
y_obs = Trout_Size_Data$y_obs, #vector length N_obs
obs_index = Trout_Size_Data$obs_index, #vector length N_obs
time = Trout_Size_Data$time, #Vector length N_obs
- ind_id = Trout_Size_Data$ind_id, #Vector length N_obs
- y_0_obs = Trout_Size_Data$y_obs[which(Trout_Size_Data$obs_index == 1)] #Vector length N_ind
+ ind_id = Trout_Size_Data$ind_id #Vector length N_obs
)
expect_named(constant_multi)
@@ -121,3 +119,39 @@ test_that("Execution and output: bad input", {
hmde_assign_data(data = mtcars)
)
})
+
+
+test_that("Execution and output: prior values", {
+ time <- 1:5
+ y_obs <- 1:5
+ obs_index <- 1:5
+
+ default_used <- hmde_model("affine_single_ind") |>
+ hmde_assign_data(n_obs = length(y_obs),
+ y_obs= y_obs,
+ obs_index = obs_index,
+ time = time,
+ y_bar = mean(y_obs),
+ step_size = 1,
+ int_method = 1)
+
+ expect_equal(default_used$prior_pars_ind_const, c(1,2))
+
+ value_supplied <- hmde_model("affine_single_ind") |>
+ hmde_assign_data(n_obs = length(y_obs),
+                     y_obs = y_obs,
+ obs_index = obs_index,
+ time = time,
+ y_bar = mean(y_obs),
+ step_size = 1,
+ int_method = 1,
+ prior_pars_ind_const = c(5,5))
+
+ expect_equal(value_supplied$prior_pars_ind_const, c(5,5))
+
+ value_supplied_tibble_used <- hmde_model("constant_multi_ind") |>
+ hmde_assign_data(data = Trout_Size_Data,
+ prior_pars_pop_beta_mu = c(1, 3))
+
+ expect_equal(value_supplied_tibble_used$prior_pars_pop_beta_mu, c(1, 3))
+})
diff --git a/tests/testthat/test-hmde_extract_estimates.R b/tests/testthat/test-hmde_extract_estimates.R
index f14202f..3f54d5f 100644
--- a/tests/testthat/test-hmde_extract_estimates.R
+++ b/tests/testthat/test-hmde_extract_estimates.R
@@ -7,16 +7,13 @@ test_that("Execution and output: extract_estimates function", {
)
suppressWarnings(
- constant_single_fit <- hmde_model("constant_single_ind") |>
+ output <- hmde_model("constant_single_ind") |>
hmde_assign_data(data = data) |>
hmde_run(chains = 1, iter = 20, cores = 1,
- verbose = FALSE, show_messages = FALSE)
+ verbose = FALSE, show_messages = FALSE) |>
+ hmde_extract_estimates(input_measurement_data = data)
)
- output <- hmde_extract_estimates(model = "constant_single_ind",
- fit = constant_single_fit,
- input_measurement_data = data)
-
expect_named(output)
expect_visible(output)
diff --git a/tests/testthat/test-hmde_models_affine.R b/tests/testthat/test-hmde_models_affine.R
new file mode 100644
index 0000000..f1110d3
--- /dev/null
+++ b/tests/testthat/test-hmde_models_affine.R
@@ -0,0 +1,10 @@
+#Testing for affine model
+test_that("Model structures: affine", {
+ # Single individual
+ single_model <- hmde_model("affine_single_ind")
+ expect_named(single_model, c("step_size", "n_obs", "y_obs", "obs_index",
+ "time", "int_method", "y_bar", "prior_pars_ind_const",
+ "prior_pars_ind_beta_1", "model"))
+ expect_type(single_model, "list")
+ expect_visible(single_model)
+})
diff --git a/tests/testthat/test-hmde_models_canham.R b/tests/testthat/test-hmde_models_canham.R
index e3c1bfb..5295572 100644
--- a/tests/testthat/test-hmde_models_canham.R
+++ b/tests/testthat/test-hmde_models_canham.R
@@ -3,7 +3,11 @@ test_that("Model structures: canham", {
# Single individual
single_model <- hmde_model("canham_single_ind")
expect_named(single_model, c("n_obs", "y_obs",
- "obs_index", "time", "y_0_obs",
+ "obs_index", "time",
+ "prior_pars_ind_max_growth",
+ "prior_pars_ind_size_at_max_growth",
+ "prior_pars_ind_k",
+ "prior_pars_global_error_sigma",
"model"))
expect_type(single_model, "list")
expect_visible(single_model)
@@ -11,7 +15,14 @@ test_that("Model structures: canham", {
#Multiple individuals
multi_model <- hmde_model("canham_multi_ind")
expect_named(multi_model, c("n_obs", "n_ind", "y_obs",
- "obs_index", "time", "ind_id", "y_0_obs",
+ "obs_index", "time", "ind_id",
+ "prior_pars_pop_max_growth_mean",
+ "prior_pars_pop_max_growth_sd",
+ "prior_pars_pop_size_at_max_growth_mean",
+ "prior_pars_pop_size_at_max_growth_sd",
+ "prior_pars_pop_k_mean",
+ "prior_pars_pop_k_sd",
+ "prior_pars_global_error_sigma",
"model"))
expect_type(multi_model, "list")
expect_visible(multi_model)
@@ -37,6 +48,7 @@ test_that("Execution: canham multiple individuals", {
1 + #Global error
data$n_obs + #y_ij
1 + #y_temp
+ 14 + #checks for priors
1 #lp__
hmde_test_multi_individual(model_name, data, est_dim)
diff --git a/tests/testthat/test-hmde_models_constant.R b/tests/testthat/test-hmde_models_constant.R
index 51c623f..b595554 100644
--- a/tests/testthat/test-hmde_models_constant.R
+++ b/tests/testthat/test-hmde_models_constant.R
@@ -3,7 +3,9 @@ test_that("Model structures: Constant", {
# Single individual
single_model <- hmde_model("constant_single_ind")
expect_named(single_model, c("n_obs", "y_obs",
- "obs_index", "time", "y_0_obs",
+ "obs_index", "time",
+ "prior_pars_ind_beta",
+ "prior_pars_global_error_sigma",
"model"))
expect_type(single_model, "list")
expect_visible(single_model)
@@ -11,7 +13,10 @@ test_that("Model structures: Constant", {
#Multiple individuals
multi_model <- hmde_model("constant_multi_ind")
expect_named(multi_model, c("n_obs", "n_ind", "y_obs",
- "obs_index", "time", "ind_id", "y_0_obs",
+ "obs_index", "time", "ind_id",
+ "prior_pars_pop_beta_mu",
+ "prior_pars_pop_beta_sigma",
+ "prior_pars_global_error_sigma",
"model"))
expect_type(multi_model, "list")
expect_visible(multi_model)
@@ -36,7 +41,7 @@ test_that("Execution: Constant multiple individuals", {
data$n_pars * 2 + #Population parameters
1 + #Global error
data$n_obs + #y_ij
- data$n_obs + #Delta y_ij
+ 6 + #checks for priors
1 #lp__
hmde_test_multi_individual(model_name, data, est_dim)
diff --git a/tests/testthat/test-hmde_models_linear.R b/tests/testthat/test-hmde_models_linear.R
deleted file mode 100644
index 0ffc1a4..0000000
--- a/tests/testthat/test-hmde_models_linear.R
+++ /dev/null
@@ -1,9 +0,0 @@
-#Testing for linear model
-test_that("Model structures: linear", {
- # Single individual
- single_model <- hmde_model("linear_single_ind")
- expect_named(single_model, c("step_size", "n_obs", "y_obs", "obs_index",
- "time", "y_0_obs", "y_bar", "model"))
- expect_type(single_model, "list")
- expect_visible(single_model)
-})
diff --git a/tests/testthat/test-hmde_models_vb.R b/tests/testthat/test-hmde_models_vb.R
index 3541c64..dba96bb 100644
--- a/tests/testthat/test-hmde_models_vb.R
+++ b/tests/testthat/test-hmde_models_vb.R
@@ -3,8 +3,12 @@ test_that("Model structures: vb", {
# Single individual
single_model <- hmde_model("vb_single_ind")
expect_named(single_model, c("n_obs", "y_obs",
- "obs_index", "time", "y_0_obs",
- "y_bar", "model"))
+ "obs_index", "time",
+ "y_bar",
+ "prior_pars_ind_max_size_sd_only",
+ "prior_pars_ind_growth_rate",
+ "prior_pars_global_error_sigma",
+ "model"))
expect_type(single_model, "list")
expect_visible(single_model)
@@ -12,7 +16,13 @@ test_that("Model structures: vb", {
multi_model <- hmde_model("vb_multi_ind")
expect_named(multi_model, c("n_obs", "n_ind", "y_obs",
"obs_index", "time", "ind_id",
- "y_0_obs", "y_bar", "model"))
+ "y_bar",
+ "prior_pars_pop_max_size_mean_sd_only",
+ "prior_pars_pop_max_size_sd",
+ "prior_pars_pop_growth_rate_mean",
+ "prior_pars_pop_growth_rate_sd",
+ "prior_pars_global_error_sigma",
+ "model"))
expect_type(multi_model, "list")
expect_visible(multi_model)
})
@@ -32,15 +42,13 @@ test_that("Execution: vb multiple individuals", {
#Dimension is:
est_dim <- data$n_ind + #Initial condition
- (data$n_pars-1) * data$n_ind + #Individual parameters, -1 as y_bar not estimated
- (data$n_pars-1) * 2 + #Population parameters
+ data$n_pars * data$n_ind + #Individual parameters
+ data$n_pars * 2 + #Population parameters
1 + #Global error
data$n_obs + #y_ij
- data$n_obs + #Delta y_ij
- (data$n_pars) + #Pars temp vector
- 1 + #lp__
- 1 + #temp_y_final
- 6 #Generated quantities
+ data$n_pars + 1 + #Pars temp vector
+ 10 + #checks for priors
+ 1 #lp__
hmde_test_multi_individual(model_name, data, est_dim)
})
diff --git a/tests/testthat/test-hmde_plot_de_pieces.R b/tests/testthat/test-hmde_plot_de_pieces.R
index 9d7dfbd..d0de442 100644
--- a/tests/testthat/test-hmde_plot_de_pieces.R
+++ b/tests/testthat/test-hmde_plot_de_pieces.R
@@ -6,8 +6,7 @@ test_that("Execution and output: plot_de_pieces function", {
verbose = FALSE, show_messages = FALSE)
)
- output <- hmde_extract_estimates(model = "constant_multi_ind",
- fit = multi_ind_trout,
+ output <- hmde_extract_estimates(fit = multi_ind_trout,
input_measurement_data = Trout_Size_Data)
plot <- hmde_plot_de_pieces(model = "constant_multi_ind",
@@ -33,8 +32,7 @@ test_that("Execution and output: bad input", {
verbose = FALSE, show_messages = FALSE)
)
- output <- hmde_extract_estimates(model = "constant_multi_ind",
- fit = multi_ind_trout,
+ output <- hmde_extract_estimates(fit = multi_ind_trout,
input_measurement_data = Trout_Size_Data)
expect_error(
diff --git a/tests/testthat/test-hmde_plot_obs_est_inds.R b/tests/testthat/test-hmde_plot_obs_est_inds.R
new file mode 100644
index 0000000..c98b1af
--- /dev/null
+++ b/tests/testthat/test-hmde_plot_obs_est_inds.R
@@ -0,0 +1,30 @@
+test_that("Execution and output: plot_obs_est_inds function", {
+ plot <- hmde_plot_obs_est_inds(n_ind_to_plot = 2,
+ measurement_data = Tree_Size_Ests$measurement_data)
+
+ expect_named(plot)
+
+ expect_visible(plot)
+
+ expect_type(plot, "list")
+})
+
+
+test_that("Execution and output: bad input", {
+ expect_error(
+ hmde_plot_obs_est_inds(measurement_data = Tree_Size_Ests$measurement_data)
+ )
+
+ expect_error(
+ hmde_plot_obs_est_inds()
+ )
+
+ expect_error(
+ hmde_plot_obs_est_inds(individual_data = Tree_Size_Ests$measurement_data)
+ )
+
+ expect_error(
+ hmde_plot_obs_est_inds(n_ind_to_plot = 10^3,
+ measurement_data = Tree_Size_Ests$measurement_data)
+ )
+})
diff --git a/vignettes/canham.Rmd b/vignettes/canham.Rmd
index 9318c75..59abd61 100644
--- a/vignettes/canham.Rmd
+++ b/vignettes/canham.Rmd
@@ -3,10 +3,9 @@ title: "Case study 3: Canham function growth with tree data from Barro Colorado
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{canham}
- %\VignetteEncoding{UTF-8}
%\VignetteEngine{knitr::rmarkdown}
-editor_options:
- chunk_output_type: console
+ %\VignetteEncoding{UTF-8}
+bibliography: vignette.bib
---
```{r, include = FALSE}
@@ -15,6 +14,8 @@ knitr::opts_chunk$set(
comment = "#>",
eval = TRUE
)
+
+options(rmarkdown.html_vignette.check_title = FALSE)
```
```{r}
@@ -23,13 +24,13 @@ library(dplyr)
library(ggplot2)
```
-Our final case study reproduces analysis from \cite{obrien2024allindividuals} with a small sample size for G. recondita in order to make the model tractable for a demonstration.
+Our final case study reproduces analysis from @obrien2024allindividuals with a small sample of _G. recondita_ in order to keep the model tractable for a demonstration.
-The function we use here is based on \cite{canham2004neighborhood}, and is a three parameter non-linear ODE given by
+The function we use here is based on @canham2004neighborhood, and is a three-parameter non-linear ODE given by
$$g(S(t);\, g_{max}, S_{max}, k) = g_{max} \exp \Bigg(-\frac{1}{2}\bigg(\frac{\ln(S(t)/S_{max})}{k} \bigg)^2 \Bigg).$$
The Canham function, as we refer to it, describes hump-shaped growth that accelerates to a peak at $(S_{max}, g_{max})$, then declines to 0 at a rate controlled by $k$. The Canham function does not have an analytic solution the way that the previous function did, so we are unable to directly encode the sizes over time and must instead use a numerical method.
-The Canham function has been used for a lot of growth analysis such as \cite{herault2011functional} and \cite{canham2006neighborhood} as well as our own previous work in \cite{obrien2024allindividuals} and [Paper 2]. The desirable features of the Canham growth model are that it has a period of increasing growth at small sizes, with a finite growth peak at $(S_{max}, g_{max})$, and then decay to near-zero growth. With different parameter combinations Canham can fit a range of growth behaviours to an observed interval: increasing growth, decreasing growth, a growth spike, or steady growth. The downside is that the Canham function is unimodal -- it can only fit a single peak and as such is not suitable to describe the full life history of species that are strongly responsive to their environment as seen in [Paper 2].
+The Canham function has been widely used for growth analysis, for example in @herault2011functional and @canham2006neighborhood as well as our own previous work in @obrien2024allindividuals and [Paper 2]. The desirable features of the Canham growth model are that it has a period of increasing growth at small sizes, with a finite growth peak at $(S_{max}, g_{max})$, and then decay to near-zero growth. With different parameter combinations Canham can fit a range of growth behaviours to an observed interval: increasing growth, decreasing growth, a growth spike, or steady growth. The downside is that the Canham function is unimodal -- it can only fit a single peak and as such is not suitable to describe the full life history of species that are strongly responsive to their environment as seen in [Paper 2].
The next bit of code plots the Canham function for chosen parameter values. We have provided some, but encourage playing around with the parameters and seeing what happens to the function.
```{r}
@@ -52,7 +53,26 @@ ggplot() +
xlim=c(y_0, y_final))
```
-The data we use for this demonstration comes from the Barro Colorado Island long term forest plot \cite{condit2019dataset}. We have a simple random sample without replacement of 50 individuals from [species], subject to the following inclusion criteria for individuals:
+The default priors for the Canham top-level parameters for the single individual model are
+$$g_{max} \sim \log\mathcal{N}(0, 2),$$
+$$S_{max} \sim \log\mathcal{N}(0, 2),$$
+$$k \sim \log\mathcal{N}(0, 2),$$
+$$0 < \sigma_e \sim Cauchy(0, 2).$$
+For the multi-individual model the prior structure and default parameters are
+$$\mu_{g_{max}} \sim \mathcal{N}(0, 2),$$
+$$0 < \sigma_{g_{max}} \sim Cauchy(0, 2),$$
+$$\mu_{S_{max}} \sim \mathcal{N}(0, 2),$$
+$$0 < \sigma_{S_{max}} \sim Cauchy(0, 2),$$
+$$\mu_{k} \sim \mathcal{N}(0, 2),$$
+$$0 < \sigma_{k} \sim Cauchy(0, 2),$$
+$$0 < \sigma_e \sim Cauchy(0, 2).$$
+To see the argument names for the prior parameters, run `hmde_model()` with the relevant model name. For example, in the following we want to change the prior for $g_{max}$ (`ind_max_growth`) in the single individual model:
+```{r}
+hmde_model("canham_single_ind")
+#prior_pars_ind_max_growth is the argument name for the prior parameters
+```
+
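+To actually change a prior, pass the new parameter vector to `hmde_assign_data` by name. A minimal sketch, assuming a tibble `single_tree_data` with the required columns and hypothetical prior values:
+```{r, eval=FALSE}
+#Assumed here: prior values passed as c(mean, sd) of the underlying normal
+hmde_model("canham_single_ind") |>
+  hmde_assign_data(data = single_tree_data,
+                   prior_pars_ind_max_growth = c(log(2), 1))
+```
+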
+The data we use for this demonstration comes from the Barro Colorado Island long term forest plot [@condit2019dataset]. We have a simple random sample without replacement of 50 individuals of _Garcinia recondita_, subject to the following inclusion criteria for individuals:
\begin{itemize}
\item 6 observations since the 1990 change in measurement precision,
\item Consistent stem and tree ID matching,
@@ -75,19 +95,18 @@ hist(Tree_Size_Data_Transformed$Delta_y_obs,
xlab = "Growth increment (cm)", main="")
```
-Due to the complexity of the Canham model the sampling can take up to 3 hours. If you decide to run your own samples we recommend saving the model outputs using `saveRDS()` so you don't need to rerun your model every time. As is, the following block will not run by default as we provide a data set of estimates with the package in the `Tree_Size_Ests` data file which can be accessed directly.
+Due to the complexity of the Canham model the sampling can take up to 3 hours. If you decide to run your own samples we recommend saving the model outputs using `saveRDS()` so you don't need to rerun your model every time. We provide a data set of estimates with the package in the `Tree_Size_Ests` data file which can be accessed directly.
```{r, eval=FALSE}
tree_canham_fit <- hmde_model("canham_multi_ind") |>
hmde_assign_data(data = Tree_Size_Data) |>
hmde_run(chains = 4, cores = 4, iter = 2000)
-Tree_Size_Ests <- hmde_extract_estimates(model = "canham_multi_ind",
- fit = tree_canham_fit,
+tree_canham_estimates <- hmde_extract_estimates(fit = tree_canham_fit,
input_measurement_data = Tree_Size_Data)
-saveRDS(tree_canham_fit, "inst/extdata/tree_canham_fit.rds")
-saveRDS(Tree_Size_Ests, "inst/extdata/Tree_Size_Ests.rds")
+#saveRDS(tree_canham_fit, "tree_canham_fit.rds")
+#saveRDS(tree_canham_estimates, "tree_canham_estimates.rds")
```
The model analysis follows the same workflow as the previous demonstrations: we look at how individual sizes over time and fitted growth functions behave, then examine evidence of relationships between parameter values at the individual level. We also look at the species-level parameter CIs and estimates.
@@ -105,133 +124,173 @@ measurement_data_transformed <- Tree_Size_Ests$measurement_data %>%
ungroup()
#Distributions of estimated growth and size
-hist(measurement_data_transformed$y_hat,
- main = "Estimated size distribution",
- xlab = "Size (cm)")
-hist(measurement_data_transformed$delta_y_est,
- main = "Estimated growth increments",
- xlab = "Growth increment (cm)")
-hist(measurement_data_transformed$est_growth_rate,
- main = "Estimated annualised growth rate distribution",
- xlab = "Growth rate (cm/yr)")
+ggplot(measurement_data_transformed,
+ aes(y_hat)) +
+ geom_histogram(bins = 10,
+ colour = "black",
+ fill = "lightblue") +
+ labs(x="Size (cm)",
+ title = "Estimated size distribution") +
+ theme_classic()
+
+ggplot(measurement_data_transformed,
+ aes(delta_y_est)) +
+ geom_histogram(bins = 10,
+ colour = "black",
+ fill = "lightblue") +
+ labs(x = "Growth increment (cm)",
+ title = "Estimated growth increments") +
+ theme_classic()
+
+ggplot(measurement_data_transformed,
+ aes(est_growth_rate)) +
+ geom_histogram(bins = 10,
+ colour = "black",
+ fill = "lightblue") +
+ labs(x = "Growth rate (cm/yr)",
+ title = "Estimated annualised growth rate distribution") +
+ theme_classic()
#Quantitative R^2
cor(measurement_data_transformed$y_obs, measurement_data_transformed$y_hat)^2
-#Plots of size over time for a sample of 5 individuals
-sample_ids <- sample(1:nrow(Tree_Size_Ests$individual_data), size=5)
-plot_data <- measurement_data_transformed %>%
- filter(ind_id %in% sample_ids)
-
-ggplot(data=plot_data, aes(group = ind_id)) +
- geom_point(aes(x = time, y=y_obs, colour = as.factor(ind_id)),
- shape = 1) +
- geom_line(aes(x = time, y=y_obs, colour = as.factor(ind_id)),
- linetype = "dashed") +
- geom_point(aes(x = time, y=y_hat, colour = as.factor(ind_id)),
- shape = 2) +
- geom_line(aes(x = time, y=y_hat, colour = as.factor(ind_id)),
- linetype = "solid") +
- labs(x="Time (years)", y="DBH (cm)", colour="Ind. ID") +
+r_sq_est <- cor(Tree_Size_Ests$measurement_data$y_obs,
+ Tree_Size_Ests$measurement_data$y_hat)^2
+r_sq <- paste0("R^2 = ",
+ signif(r_sq_est,
+ digits = 3))
+
+obs_est_scatter <- ggplot(data = Tree_Size_Ests$measurement_data,
+ aes(x = y_obs, y = y_hat)) +
+ geom_point(shape = 16, size = 1, colour = "green4") +
+ xlab("Y obs.") +
+ ylab("Y est.") +
+ geom_abline(slope = 1, linetype = "dashed") +
+ annotate("text", x = 7, y = 22,
+ label = r_sq) +
theme_classic()
+obs_est_scatter
+
+#Plots of size over time for a sample of 5 individuals
+obs_est_size_plot <- hmde_plot_obs_est_inds(n_ind_to_plot = 5,
+ measurement_data = Tree_Size_Ests$measurement_data) +
+ theme(legend.position = "inside",
+ legend.position.inside = c(0.1, 0.85)) +
+ guides(colour=guide_legend(nrow=2,byrow=TRUE))
+obs_est_size_plot
+
```
Individual parameter analysis and growth function plots follow.
```{r}
#1-dimensional parameter distributions
-hist(Tree_Size_Ests$individual_data$ind_max_growth_mean,
- main = "Individual max growth rate parameters",
- xlab = "g_max estimate")
+ggplot(Tree_Size_Ests$individual_data,
+ aes(ind_max_growth_mean)) +
+ geom_histogram(bins = 10,
+ colour = "black",
+ fill = "lightblue") +
+ labs(x="g_max estimate") +
+ theme_classic()
-hist(Tree_Size_Ests$individual_data$ind_size_at_max_growth_mean,
- main = "Individual size at max growth parameters",
- xlab = "S_max estimate")
+ggplot(Tree_Size_Ests$individual_data,
+ aes(ind_size_at_max_growth_mean)) +
+ geom_histogram(bins = 10,
+ colour = "black",
+ fill = "lightblue") +
+ labs(x="S_max estimate") +
+ theme_classic()
-hist(Tree_Size_Ests$individual_data$ind_max_growth_mean,
- main = "Individual spread parameters",
- xlab = "k estimate")
+ggplot(Tree_Size_Ests$individual_data,
+ aes(ind_k_mean)) +
+ geom_histogram(bins = 10,
+ colour = "black",
+ fill = "lightblue") +
+ labs(x="k estimate") +
+ theme_classic()
#2-dimensional parameter distributions
-ggplot(data = Tree_Size_Ests$individual_data,
+pairplot1 <- ggplot(data = Tree_Size_Ests$individual_data,
aes(x = ind_max_growth_mean, y = ind_size_at_max_growth_mean)) +
geom_point(shape = 16, size = 1, colour = "green4") +
- xlab("Individual max growth (cm/yr) g_max") +
- ylab("Individual size at max growth (cm) S_max") +
+ xlab("Ind. max growth (cm/yr)") +
+ ylab("Ind. size at max growth (cm)") +
theme_classic()
-ggplot(data = Tree_Size_Ests$individual_data,
+pairplot2 <- ggplot(data = Tree_Size_Ests$individual_data,
aes(x = ind_max_growth_mean, y = ind_k_mean)) +
geom_point(shape = 16, size = 1, colour = "green4") +
- xlab("Individual max growth (cm/yr) g_max") +
- ylab("Individual spread parameters k") +
+ xlab("Ind. max growth (cm/yr)") +
+ ylab("Ind. spread par.") +
theme_classic()
-ggplot(data = Tree_Size_Ests$individual_data,
+pairplot3 <- ggplot(data = Tree_Size_Ests$individual_data,
aes(x = ind_k_mean, y = ind_size_at_max_growth_mean)) +
geom_point(shape = 16, size = 1, colour = "green4") +
- xlab("Individual spread parameters k") +
- ylab("Individual size at max growth (cm) S_max") +
+ xlab("Ind. spread par.") +
+ ylab("Ind. size at max growth (cm)") +
theme_classic()
-#Linear correlation of parameters
-cor(Tree_Size_Ests$individual_data[,c(2,6,10)])
+pairplot1
+pairplot2
+pairplot3
+
+#Monotonic (Spearman) correlation of parameters
+cor(Tree_Size_Ests$individual_data[,c(2,6,10)], method="spearman")
#Plot function pieces over estimated sizes.
-hmde_plot_de_pieces(model = "canham_multi_ind",
+est_de_plot <- hmde_plot_de_pieces(model = "canham_multi_ind",
individual_data = Tree_Size_Ests$individual_data,
measurement_data = Tree_Size_Ests$measurement_data)
+est_de_plot
```
At the hyper-parameter level for the whole population we have centre and spread parameters for the log-normal distributions of $g_{max}$, $S_{max}$ and $k$.
```{r}
-#Max growth
-Tree_Size_Ests$population_data$mean[1] #Raw value
-print(paste0("95% CI for mean log max growth rate: (",
- Tree_Size_Ests$population_data$CI_lower[1], " , ",
- Tree_Size_Ests$population_data$CI_upper[1], ")")) #Raw CI
-
-exp(Tree_Size_Ests$population_data$mean[1]) #In cm units
-print(paste0("95% CI for mean max growth rate in cm/yr: (",
- exp(Tree_Size_Ests$population_data$CI_lower[1]), " , ",
- exp(Tree_Size_Ests$population_data$CI_upper[1]), ")"))
-
-#Standard deviation of underlying normal distribution
-Tree_Size_Ests$population_data$mean[2]
-print(paste0("95% CI for log max growth rate standard deviation: (",
- Tree_Size_Ests$population_data$CI_lower[2], " , ",
- Tree_Size_Ests$population_data$CI_upper[2], ")")) #Raw CI
-
-#size at max growth
-Tree_Size_Ests$population_data$mean[3] #Raw value
-print(paste0("95% CI for mean log size at max growth rate: (",
- Tree_Size_Ests$population_data$CI_lower[3], " , ",
- Tree_Size_Ests$population_data$CI_upper[3], ")")) #Raw CI
-
-exp(Tree_Size_Ests$population_data$mean[3]) #In cm units
-print(paste0("95% CI for mean max size at growth rate in cm: (",
- exp(Tree_Size_Ests$population_data$CI_lower[3]), " , ",
- exp(Tree_Size_Ests$population_data$CI_upper[3]), ")"))
-
-#Standard deviation of underlying normal distribution
-Tree_Size_Ests$population_data$mean[4]
-print(paste0("95% CI for log max size at growth rate standard deviation: (",
- Tree_Size_Ests$population_data$CI_lower[4], " , ",
- Tree_Size_Ests$population_data$CI_upper[4], ")")) #Raw CI
-
-#Spread parameter k
-Tree_Size_Ests$population_data$mean[5] #Raw value
-print(paste0("95% CI for mean log spread parameter: (",
- Tree_Size_Ests$population_data$CI_lower[5], " , ",
- Tree_Size_Ests$population_data$CI_upper[5], ")")) #Raw CI
-
-exp(Tree_Size_Ests$population_data$mean[5]) #In cm units
-print(paste0("95% CI for mean spread parameter: (",
- exp(Tree_Size_Ests$population_data$CI_lower[5]), " , ",
- exp(Tree_Size_Ests$population_data$CI_upper[5]), ")"))
-
-#Standard deviation of underlying normal distribution
-Tree_Size_Ests$population_data$mean[6]
-print(paste0("95% CI for log spread parameter standard deviation: (",
- Tree_Size_Ests$population_data$CI_lower[6], " , ",
- Tree_Size_Ests$population_data$CI_upper[], ")")) #Raw CI
+pars_CI_names <- c(
+  "mean log max growth rate",
+  "log max growth rate standard deviation",
+  "mean log size at max growth rate",
+  "log size at max growth rate standard deviation",
+  "mean log spread parameter",
+  "log spread parameter standard deviation"
+)
+
+#Picks out the centre parameters, which also get back-transformed
+#(exponentiated) versions on the measurement scale
+exp_vec <- c(TRUE, FALSE,
+             TRUE, FALSE,
+             TRUE, FALSE)
+
+#Print mean estimates and CIs
+for(i in 1:length(pars_CI_names)){
+  print(paste0(pars_CI_names[i],
+               " estimate: ",
+               Tree_Size_Ests$population_data$mean[i]))
+  print(paste0("95% CI for ",
+               pars_CI_names[i],
+               ": (",
+               Tree_Size_Ests$population_data$CI_lower[i],
+               ", ",
+               Tree_Size_Ests$population_data$CI_upper[i],
+               ")"))
+
+  if(exp_vec[i]){ #Back-transform centre parameters through exponentiation
+    print(paste0("Back-transformed ",
+                 pars_CI_names[i],
+                 " estimate: ",
+                 exp(Tree_Size_Ests$population_data$mean[i])))
+    print(paste0("95% CI: (",
+                 exp(Tree_Size_Ests$population_data$CI_lower[i]),
+                 ", ",
+                 exp(Tree_Size_Ests$population_data$CI_upper[i]),
+                 ")"))
+  }
+}
```
+
+
+## References
+
diff --git a/vignettes/constant-growth.Rmd b/vignettes/constant-growth.Rmd
index ed4a69e..ab52286 100644
--- a/vignettes/constant-growth.Rmd
+++ b/vignettes/constant-growth.Rmd
@@ -3,18 +3,23 @@ title: "Case study 1: Constant growth with SUSTAIN Trout data"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{constant-growth}
- %\VignetteEncoding{UTF-8}
%\VignetteEngine{knitr::rmarkdown}
-editor_options:
- chunk_output_type: console
+ %\VignetteEncoding{UTF-8}
+bibliography: vignette.bib
---
```{r, include = FALSE}
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
- fig.align='center'
+ fig.align = 'center',
+ message = FALSE,
+ warning = FALSE
)
+
+library(patchwork)
+
+options(rmarkdown.html_vignette.check_title = FALSE)
```
## Overview
@@ -25,8 +30,6 @@ f(Y(t), \beta) = \beta
\end{equation}
where $\beta$ is the average growth rate. The constant growth model corresponds to linear sizes over time, and is equivalent to a linear mixed model for size, where there is an individual effect when fit to multiple individuals.
-Let's simulate some data to visualise the constant growth function.
-
### Load dependencies
```{r setup}
@@ -38,17 +41,41 @@ library(dplyr)
library(ggplot2)
```
-### Visualise data
+### Priors
+The default priors for the constant top-level parameters in the single individual model are
+$$\beta\sim \log\mathcal{N}(0, 2),$$
+$$0 <\sigma_e \sim Cauchy(0, 2).$$
-Here are some plots to demonstrate how the constant growth function relates to sizes over time for a single individual. Feel free to play around with the parameter settings (`beta`, `y_0`) and see how the plot changes.
+For the multi-individual model the prior structure and default parameters are
+$$\mu_{\beta} \sim \mathcal{N}(0, 2),$$
+$$0 < \sigma_{\beta} \sim Cauchy(0, 2),$$
+$$0 < \sigma_{e} \sim Cauchy(0, 2).$$
+To change the prior parameter values (the distributions themselves are fixed), optional arguments can be passed to `hmde_assign_data` with names corresponding to the `prior_pars` argument for the associated parameter, as output by `hmde_model()`. For example, in the following we want to change the prior for $\beta$ (`ind_beta`) in the single individual model:
+```{r}
+hmde_model("constant_single_ind")
+#prior_pars_ind_beta is the argument name for the prior parameters
+```
+The mean passed to a log-normal distribution is the mean of the **underlying normal distribution**, so if you want to pass a value based on raw observations you need to log-transform it first.
+
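+As a minimal sketch, assuming a tibble `trout_subset` with the required columns, we could centre the prior for $\beta$ at a hypothetical observed mean growth rate of 2 cm/yr:
+```{r, eval=FALSE}
+#Assumed here: prior values passed as c(mean, sd) of the underlying normal
+hmde_model("constant_single_ind") |>
+  hmde_assign_data(data = trout_subset,
+                   prior_pars_ind_beta = c(log(2), 2))
+```
+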
+Let's simulate some data to visualise the constant growth function.
-```{r, fig.width=3, fig.height=3}
-#Simulate data
+### Simulate data
+
+```{r}
beta <- 2 #Annual growth rate
y_0 <- 1 #Starting size
time <- c(0,20)
sizes_over_time <- tibble(Y_t = 1 + beta*time, #Linear sizes over time
t = time)
+
+sizes_over_time
+```
+
+### Visualise data
+
+Here are some plots to demonstrate how the constant growth function relates to sizes over time for a single individual. Feel free to play around with the parameter settings (`beta`, `y_0`) and see how the plot changes.
+
+```{r, echo=FALSE, fig.width=6, fig.height=3}
#Plot of growth function
constant_growth_function <- ggplot() +
xlim(y_0, max(sizes_over_time$Y_t)) +
@@ -72,11 +99,35 @@ sizes_over_time <- ggplot(data = sizes_over_time, aes(x=t, y = Y_t)) +
theme(axis.text=element_text(size=16),
axis.title=element_text(size=18,face="bold"))
-constant_growth_function
-sizes_over_time
+constant_growth_function + sizes_over_time
+```
+
+```{r, eval=FALSE}
+#Plot of growth function
+ggplot() +
+ geom_function(fun = hmde_model_des("constant_single_ind"), # Visualising the growth function
+ args = list(pars = list(beta)),
+ colour = "green4", linewidth=1,
+                xlim = c(y_0, max(sizes_over_time$Y_t))) +
+ xlim(y_0, max(sizes_over_time$Y_t)) + # Creating the x axis
+ ylim(0, beta*2) + # Creating the y axis
+ labs(x = "Y(t)", y = "f", title = "Constant growth") + # Axe labels and plot title
+ theme_classic() + # Theme for the plot
+ theme(axis.text=element_text(size=16), # Plot customising
+ axis.title=element_text(size=18,face="bold"))
+
+#Sizes over time
+ggplot(data = sizes_over_time, aes(x=t, y = Y_t)) +
+ geom_line(colour="green4", linewidth=1) + # Line graph of sizes_over_time
+ xlim(0, max(sizes_over_time$t)) +
+ ylim(0, max(sizes_over_time$Y_t)*1.05) +
+ labs(x = "Time", y = "Y(t)", title = "Constant growth") +
+ theme_classic() +
+ theme(axis.text=element_text(size=16),
+ axis.title=element_text(size=18,face="bold"))
```
-A key take-away of the function plot is the relationship to what we think of as a "reasonable growth model". We don't expect constant growth rates to be realistic, at best they represent the average rate of change over a period. More complex models may be more realistic, but in this case study we are only interested in different mechanisms of size dependence, we do not use environmental covariates for example.
+A key take-away of the function plot (on the left) is the relationship to what we think of as a "reasonable growth model". We don't expect constant growth rates to be realistic; at best they represent the average rate of change over a period. More complex models may be more realistic, but in this case study we are only interested in different mechanisms of size dependence and do not use environmental covariates, for example.
## SUSTAIN trout data
Our example data for the constant model comes from @moe2020_TroutData, a publicly available dataset of mark-recapture data for _Salmo trutta_ in Norway. The time between observations is not controlled, nor is the number of observations per individual. As a result the data consists primarily of individuals with two observations of size, constituting a single observation of growth which limits the growth functions that can be fit to individuals as a single parameter model is the best that can be fit to two sizes. The constant growth function in Equation \eqref{eqn_3_const} is the most appropriate of the functions we have in `hmde`, as we can interpret the single growth interval as an estimate of the average growth rate that gets fit to $\beta$.
@@ -87,6 +138,8 @@ In order to best reflect the survey data we took a stratified sample of individu
Trout_Size_Data
```
+### Transform data
+
As initial exploration we will have a look at the distribution of observed sizes, growth behaviour, and observation intervals. First we transform the data to extract growth increment and observation interval information, then plot it.
```{r}
Trout_Size_Data_transformed <- Trout_Size_Data %>%
@@ -97,39 +150,94 @@ Trout_Size_Data_transformed <- Trout_Size_Data %>%
obs_growth_rate = delta_y_obs/obs_interval
) %>%
ungroup()
+
+Trout_Size_Data_transformed
```
-```{r, eval=FALSE}
-hist(Trout_Size_Data$y_obs,
- main = "Observed size distribution",
- xlab = "Size (cm)")
+### Visualise raw data
+
+Let's create some histograms to investigate the distribution of size, growth interval and growth increments.
+
+```{r, echo = FALSE, fig.width=6, fig.height=6}
+histogram_func <- function(data, var, main, xlab, ...){
+ ggplot2::ggplot(data = data, aes(x = {{var}})) +
+ geom_histogram(colour = "black", fill = "lightblue", ...) +
+ labs(title = main,
+ x= xlab) +
+ theme_classic()
+}
+
+hist_a <- histogram_func(Trout_Size_Data,
+ y_obs,
+ "Observed size distribution",
+ "Size (cm)",
+ binwidth = 5)
+
+hist_b <- histogram_func(Trout_Size_Data_transformed,
+ obs_interval,
+ "Observed interval distribution",
+ "Time (yr)",
+ binwidth = 0.55)
+
+hist_c <- histogram_func(Trout_Size_Data_transformed,
+ delta_y_obs,
+ "Observed growth increments",
+ "Growth increment (cm)",
+ binwidth = 5.5)
+
+hist_d <- histogram_func(Trout_Size_Data_transformed,
+ obs_growth_rate,
+ "Observed annualised growth \n rate distribution",
+ "Growth rate (cm/yr)",
+ binwidth = 80)
+
+hist_a + hist_b + hist_c + hist_d + plot_layout(ncol = 2)
+```
-hist(Trout_Size_Data_transformed$obs_interval,
- main = "Observation interval distribution",
- xlab = "Time (yr)")
+The growth histograms show that there are a number of negative growth increments, some reasonably extreme, and when these are combined with short observation periods we get very extreme estimates of growth rates. We can further investigate these if needed.
-hist(Trout_Size_Data_transformed$delta_y_obs,
- main = "Observed growth increments",
- xlab = "Growth increment (cm)")
+The constant growth model assumes non-negative growth and uses a log-normal distribution for $\beta$, which will eliminate those increments from the estimated sizes. We consider eliminating negative growth biologically reasonable as we don't expect the length of fish to decrease over time, even if their mass or width might.
-hist(Trout_Size_Data_transformed$obs_growth_rate,
- main = "Observed annualised growth rate distribution",
- xlab = "Growth rate (cm/yr)")
+### Fit model using {hmde}
+
+Now we will actually fit the model and extract the estimates.
+
+You can see what data structures are needed for the constant growth model by calling:
+
+```{r}
+hmde_model("constant_multi_ind") |>
+ names()
```
-The growth histograms show that there's a number of negative growth increments, some reasonably extreme, and when combined with some short observation periods we get very extreme estimates of growth rates. We can further investigate these if needed. The constant growth model assumes non-negative growth and uses a log-normal distribution for $\beta$, which will eliminate those increments from the estimated sizes. We consider eliminating negative growth biologically reasonable as we don't expect the length of fish to decrease over time, even if their mass or width might.
-Now we will actually fit the model and extract the estimates. As the provided trout data is already in the form required by the hmde_assign_data function we don't need to do any further re-naming and can pass it directly.
+Each name represents an element of the list that gets passed to the model (a minimal example follows the list):
+
+- `n_obs` is the integer number of observations $y_{ij}$
+- `n_ind` is the integer number of individuals
+- `y_obs` is the vector of $y_{ij}$ observations and should have length `n_obs`
+- `obs_index` is a vector containing the integer $j$ index for individual $i$, counting where observation $y_{ij}$ falls in the sequence
+- `time` is a vector that records when an observation happened relative to the first observation for that individual. The first observation has time 0
+- `ind_id` is a vector the same length as `y_obs` that tracks which individual an observation comes from
+- `model` is the name of the model
+
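+For illustration, a minimal hand-built input for two individuals might look like the following sketch (hypothetical values; the remaining fields such as `n_obs` and `n_ind` are filled in from the data):
+```{r, eval=FALSE}
+toy_data <- tibble(
+  y_obs     = c(1.0, 1.5, 2.0, 2.6), #Sizes in cm
+  obs_index = c(1, 2, 1, 2),         #j index within each individual
+  time      = c(0, 1, 0, 1),         #Years since first observation
+  ind_id    = c(1, 1, 2, 2)          #Which individual each row belongs to
+)
+```
+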
+Now we will actually fit the model and extract the estimates. As the provided trout data is already in the form required by the `hmde_assign_data` function we don't need to do any further re-naming and can pass it directly.
+
```{r}
trout_constant_fit <- hmde_model("constant_multi_ind") |>
hmde_assign_data(data = Trout_Size_Data) |>
hmde_run(chains = 4, cores = 1, iter = 2000)
+```
+
+### Inspect estimates
+
+Once the model has finished running, we can extract the model estimates and have a look at the distribution of estimated sizes, estimated growth increments, and annualised growth rates at the level of sizes over time.
-trout_constant_estimates <- hmde_extract_estimates(model = "constant_multi_ind",
+```{r}
+trout_constant_estimates <- hmde_extract_estimates(
fit = trout_constant_fit,
input_measurement_data = Trout_Size_Data)
```
-At the level of sizes over time we can have a look at the distribution of estimated sizes, estimated growth increments, and annualised growth rates. The negative increments are gone without actually removing those from the data set.
+First, let's do some quick data wrangling to calculate our parameters of interest.
+
```{r}
measurement_data_transformed <- trout_constant_estimates$measurement_data %>%
group_by(ind_id) %>%
@@ -141,52 +249,86 @@ measurement_data_transformed <- trout_constant_estimates$measurement_data %>%
est_growth_rate = delta_y_est/obs_interval
) %>%
ungroup()
+```
-hist(measurement_data_transformed$y_hat,
- main = "Estimated size distribution",
- xlab = "Size (cm)")
-hist(measurement_data_transformed$delta_y_est,
- main = "Estimated growth increments",
- xlab = "Growth increment (cm)")
-hist(measurement_data_transformed$est_growth_rate,
- main = "Estimated annualised growth rate distribution",
- xlab = "Growth rate (cm/yr)")
-```
-We can also directly compare the observed sizes over time to estimated values. Quantitatively we can use statistics such as $R^2$ calculated on $(y_{ij}, \hat{Y}_{ij})$, and qualitatively we can look at plots of sizes over time. We look at 5 individuals to start with because the joint plot of sizes over time can get very messy very fast.
-```{r}
+Now we can have a look at the distribution of each estimated quantity by creating histograms.
+
+```{r, fig.width=5, fig.height=6}
+est_hist_y_hat <- histogram_func(measurement_data_transformed, y_hat,
+ "Estimated size distribution",
+ xlab = "Size (cm)",
+ bins = 5)
+
+est_hist_delta_y_est <- histogram_func(measurement_data_transformed, delta_y_est,
+ "Estimated growth \n increments",
+ xlab = "Growth increment (cm)",
+ bins = 5)
+
+est_hist_growth_rate <- histogram_func(measurement_data_transformed, est_growth_rate,
+ "Estimated annualised growth rate distribution", xlab = "Growth rate (cm/yr)",
+ bins = 5)
+
+(est_hist_y_hat + est_hist_delta_y_est) / est_hist_growth_rate
+```
+
+We can see that the negative growth increments are gone! Because we fit a positive growth function ($f = \beta > 0$) the model cannot actually estimate negative growth increments.
+
+#### $R^2$
+
+We can also directly compare the observed sizes over time to estimated values.
+
+We can use $R^2$ calculated on $(y_{ij}, \hat{Y}_{ij})$, and we can inspect plots of sizes over time. The $R^2$ statistic is a metric primarily used in linear regression that measures the proportion (i.e. a value in the $[0,1]$ interval) of variance in one coordinate that can be explained by the regression model. In this context, we interpret it as how strongly the fitted and observed values agree. We don't expect perfect agreement ($R^2=1$) because the fitted sizes smooth over measurement error: @obrien2024allindividuals showed that the change between observed and fitted values can actually correct for measurement errors in size, so disagreement is not a bad thing overall.
+
+In the next block, we look at 5 random individuals to start with because plotting every individual's sizes over time can get very messy.
+```{r, fig.width=4, fig.height=4}
#Quantitative R^2
cor(measurement_data_transformed$y_obs, measurement_data_transformed$y_hat)^2
+r_sq_est <- cor(trout_constant_estimates$measurement_data$y_obs,
+ trout_constant_estimates$measurement_data$y_hat)^2
+r_sq <- paste0("R^2 = ",
+ signif(r_sq_est,
+ digits = 3))
+
+obs_est_size_plot <- ggplot(data = trout_constant_estimates$measurement_data,
+ aes(x = y_obs, y = y_hat)) +
+ geom_point(shape = 16, size = 1, colour = "green4") +
+ xlab("Y obs.") +
+ ylab("Y est.") +
+ geom_abline(slope = 1, linetype = "dashed") +
+ annotate("text", x = 45, y = 80,
+ label = r_sq) +
+ theme_classic()
+obs_est_size_plot
#Plots of size over time for a sample of 5 individuals
-sample_ids <- sample(1:nrow(trout_constant_estimates$individual_data), size=5)
-plot_data <- measurement_data_transformed %>%
- filter(ind_id %in% sample_ids)
-
-ggplot(data=plot_data, aes(group = ind_id)) +
- geom_point(aes(x = time, y=y_obs, colour = as.factor(ind_id)),
- shape = 1) +
- geom_line(aes(x = time, y=y_obs, colour = as.factor(ind_id)),
- linetype = "dashed") +
- geom_point(aes(x = time, y=y_hat, colour = as.factor(ind_id)),
- shape = 2) +
- geom_line(aes(x = time, y=y_hat, colour = as.factor(ind_id)),
- linetype = "solid") +
- labs(x="Time (years)", y="Size (cm)", colour="Ind. ID") +
- theme_classic()
+size_over_time_plot <- hmde_plot_obs_est_inds(n_ind_to_plot = 5,
+ measurement_data = trout_constant_estimates$measurement_data)
+size_over_time_plot
+
```
-At the level of individuals we are interested in the distribution of $\beta$ estimates, which will align with the estimated annualised growth rates as that's precisely what they represent. We also provide an easy way to produce a plot of the fitted growth functions in order to see how they compare to the observed sizes.
-```{r}
-hist(trout_constant_estimates$individual_data$ind_beta_mean,
- main = "Individual beta parameters",
- xlab = "beta estimate")
+### Individual growth functions ($\beta$)
-hmde_plot_de_pieces(model = "constant_multi_ind",
+At the level of individuals we are interested in the distribution of $\beta$ estimates, which will align with the estimated annualised growth rates as that's precisely what they represent. Here is one way to visualise the fitted growth functions in order to see how they compare to the observed sizes.
+
+```{r, echo=FALSE, fig.width=6.5, fig.height=5}
+ind_hist_beta <- histogram_func(trout_constant_estimates$individual_data,
+ ind_beta_mean,
+ main = "Individual beta parameters",
+ xlab = "beta estimate")
+
+de_pieces <- hmde_plot_de_pieces(model = "constant_multi_ind",
individual_data = trout_constant_estimates$individual_data,
measurement_data = trout_constant_estimates$measurement_data)
+
+
+ind_hist_beta + de_pieces
```
+### Population hyper-parameters
+
We also get estimates of the population-level hyper-parameters that govern the distribution of $\beta$ -- $\mu$ and $\sigma$ for the log-normal distribution. These are calculated in the context of the log-transformed parameters so the easiest way to interpret $\mu$ is to back-transform it through exponentiation, but this does not so easily transfer to $\sigma$. The CIs in this output are posterior credible intervals taken as the central 95% quantiles of the posterior samples.
+
```{r}
#Mean of normal distribution
trout_constant_estimates$population_data$mean[1] #Raw value
@@ -206,3 +348,6 @@ print(paste0("95% CI for log growth standard deviation: (",
trout_constant_estimates$population_data$CI_upper[2], ")")) #Raw CI
```
From the species-level data we can say that the average annual growth rate for the species is estimated to be 2.4cm/yr, with a 95% posterior CI of (1.83, 3.04). As we fit a constant growth model there's only so much we can say about the growth behaviour.
+
+## References
+
diff --git a/vignettes/here_be_dragons.Rmd b/vignettes/here_be_dragons.Rmd
new file mode 100644
index 0000000..2d03897
--- /dev/null
+++ b/vignettes/here_be_dragons.Rmd
@@ -0,0 +1,603 @@
+---
+title: "Here be dragons"
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{Here be dragons}
+ %\VignetteEncoding{UTF-8}
+ %\VignetteEngine{knitr::rmarkdown}
+editor_options:
+ chunk_output_type: console
+bibliography: vignette.bib
+---
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+ collapse = TRUE,
+ comment = "#>",
+ eval = TRUE
+)
+# job::job({
+# knitr::knit("vignettes/here-be-dragons.Rmd.orig", "vignettes/here-be-dragons.Rmd")
+# })
+
+options(rmarkdown.html_vignette.check_title = FALSE)
+```
+
+## Load dependencies
+```{r}
+#remotes::install_github("traitecoevo/hmde")
+
+library(hmde)
+library(dplyr)
+library(ggplot2)
+library(deSolve)
+library(mixtools)
+library(MASS)
+```
+
+This vignette demonstrates an interaction between numerical integration error and MCMC sampling that produces a bimodal posterior distribution. We stumbled across the problem and are documenting it here; in the package we account for it by using analytic solutions where they exist, and numerical methods with adaptive step sizes where they do not.
+
+The underlying issue is that if, given errors in the chosen numerical integration method, two sets of parameters for a differential equation $f$ give "the same" (more in a second) output for
+$$Y(t_{j+1}) = Y(t_j) + \int_{t_j}^{t_{j+1}} f(Y(t), \boldsymbol{\theta})\,dt\qquad (1)$$
+the MCMC sampler will be unable to meaningfully distinguish the parameter combinations. What you see in practice is chains converging to extremely different parameter combinations, one of which is the "true" combination, the other of which is wrong but produces the same $\hat{Y}(t_j)$ values due to the numerical error. Thus, there is a form of non-identifiability arising from numerical error in a longitudinal model based on Equation (1) that is separate from other issues of non-identifiability, and currently under-explored in the literature. In this demonstration we use a Runge-Kutta 4th order numerical method [@butcher2016numerical] with different step sizes to show that even 'small' step sizes can give problems.
+
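+To make the mechanism concrete, a fixed-step Runge-Kutta 4th order update for $dY/dt = f(Y)$ looks like the following sketch (not the package's internal implementation):
+```{r}
+#One RK4 step of size h: four slope evaluations combined with weights 1, 2, 2, 1
+rk4_step <- function(f, y, h){
+  k1 <- f(y)
+  k2 <- f(y + h * k1 / 2)
+  k3 <- f(y + h * k2 / 2)
+  k4 <- f(y + h * k3)
+  y + h * (k1 + 2 * k2 + 2 * k3 + k4) / 6
+}
+```
+The per-step error depends on both the step size $h$ and the shape of $f$, which is how two different parameter combinations can produce numerical trajectories that agree to within observation error.
+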
+It is important to state that "the same" $Y(t)$ values in this context means within statistical error of each other. We assume that our data consists of observations of the form $y_{j}$ at time $t_j$ that look like
+$$y_j = Y(t_j) + \text{error},$$
+and have some finite level of precision. The numerical method may produce estimated values $\hat{Y}(t_j)$ that differ by some amount that is much smaller than the level of precision or observation error for the different parameter combinations, but due to the imprecision of the measurement process the MCMC sampler cannot meaningfully distinguish the estimates.
+
+For this demonstration we will simulate data as though it is measured in centimetres. We use rounding to produce simulated data with measurement precision of 0.1cm, and error of $\mathcal{N}(0, 0.1)$, analogous to the 1mm measurement precision and approximate standard deviation of the real-world source data used in @obrien2024allindividuals.
+
+## The model
+We are implementing a longitudinal model of the form in Equation (1) within a hierarchical Bayesian longitudinal model where
+$$f(Y(t), \beta_0, \beta_1) = \beta_0 - \beta_1 Y(t)\qquad (2)$$
+Equation (2) is known to produce pathological behaviour from numerical methods [@butcher2016numerical], so serves as an ideal simple example of the interaction between those pathologies and the MCMC sampling process. For the purpose of estimation we shift Equation (2) by the mean observed size which gives
+$$f_{fit}(Y(t), \beta_c, \beta_1) = \beta_c - \beta_1(Y(t) - \bar{y}),$$
+then the back-transformation
+$$\beta_0 = \beta_c + \beta_1 \bar{y}.$$
+
+We are attempting to estimate the parameters $\beta_0$ and $\beta_1$ from observations $y_j$, which is based on estimating $\hat{Y}_j$ under the observation model
+$$y_j \sim \mathcal{N}(\hat{Y}_j, 0.1).$$
+
+As we are looking at a single individual, the default prior distributions for the parameters are
+$$\beta_1, \beta_c \sim \log\mathcal{N}(0,2),$$
+and enforce $\beta_k > 0$.
+
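+As a quick check that the two parameterisations agree, a sketch with hypothetical values:
+```{r}
+#f(Y) = b0 - b1 * Y rewritten as f_fit(Y) = b_c - b1 * (Y - y_bar)
+b0 <- 10; b1 <- 1; y_bar_demo <- 5
+b_c <- b0 - b1 * y_bar_demo #Centred parameter, so b0 = b_c + b1 * y_bar_demo
+f <- function(y) b0 - b1 * y
+f_fit <- function(y) b_c - b1 * (y - y_bar_demo)
+all.equal(f(2.3), f_fit(2.3)) #TRUE: same function, shifted parameterisation
+```
+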
+## Simulating data
+For this demo code we use $\beta_0 = 10$ and $\beta_1 = 1$, which gives an asymptotic size of 10. If you wish to experiment with other values, input them in the next block and the rest will run based on that. We use the analytic solution to simulate true sizes over time, then add measurement error and round to the chosen measurement precision of 0.1cm to give a sequence of observations over time that become the `y_obs` data for the model fit. Notice that `y_obs` can produce values that are bigger than the theoretical asymptotic size $\beta_0/\beta_1$ due to error.
+```{r}
+#Change these values to change the model parameters. Must be positive values.
+beta_0 <- 10
+beta_1 <- 1
+
+#True initial condition
+true_y_0 <- 1
+max_time <- 9
+time <- 0:max_time
+
+#Analytic solution
+analytic_solution <- function(x = NULL, pars = NULL){ #Pars is list of beta_0, beta_1, y_0
+ return(
+ (pars[[1]]/pars[[2]]) + (pars[[3]] - (pars[[1]]/pars[[2]])) * exp(-pars[[2]] * x)
+ )
+}
+true_pars <- list(
+ beta_0 = beta_0,
+ beta_1 = beta_1,
+ true_y_0 = true_y_0
+)
+true_args_list <- list(pars = c(beta_0,
+ beta_1,
+ true_y_0))
+
+y_true <- analytic_solution(time, true_pars)
+```
+
+From the analytic solution we produce observations by adding measurement error and rounding to a precision of 0.1.
+```{r}
+#Produce observations with error and limited precision
+y_obs <- round(y_true + rnorm(length(y_true), 0, 0.1), digits = 1)
+
+#Unrounded data if needed
+#y_obs <- y_true + rnorm(length(y_true), 0, 0.1)
+
+#Observed data frame
+obs_data_frame <- tibble(
+ time = time,
+ y_obs = y_obs,
+ obs_index = 1:length(y_obs)
+)
+
+#Have a look at the true and 'observed' data
+plot_data <- tibble(
+ x = c(time, time),
+ y = c(y_true,y_obs),
+ Data = c(rep("True sizes", times = length(y_true)),
+ rep("Obs. sizes", times = length(y_obs)))
+)
+
+sizes_over_time <- ggplot(plot_data, aes(x = x, y = y, group = Data)) +
+ geom_point(aes(colour = Data, shape = Data), size = 2) +
+ geom_line(aes(colour = Data, linetype = Data), linewidth = 1) +
+ scale_linetype_manual(values = c("dashed", "dotted")) +
+ ylim(0, 10.5) +
+ labs(x = "Time", y = "Y") +
+ theme_classic() +
+ theme(legend.position = "inside",
+ legend.position.inside = c(0.7, 0.3))
+sizes_over_time
+
+#Have a look at the observations against the analytic solution
+analytic_observed <- ggplot(obs_data_frame, aes(x = time, y = y_obs)) +
+ geom_function(fun=analytic_solution, args = true_args_list,
+ linewidth = 1, colour = "black") +
+ geom_point(colour = "darkorchid", size = 3) +
+ geom_line(colour = "darkorchid", linewidth = 1,
+ linetype = "dashed") +
+ labs(x = "Time", y = "Y",
+ title = "Data simulation") +
+ ylim(0, 10.5) +
+ theme_classic()
+analytic_observed
+```
+
+A note on error and precision: the same bad behaviour occurs even if you use unrounded data with all the precision R offers and much smaller error. You can test this by uncommenting the line that rounds the data and/or changing the measurement error standard deviation. The chosen values, and rounding, are intended to demonstrate that this is a problem that may occur with realistic precision and error.
+
+## Implementing models and collecting posterior estimates
+In this section we are going to run a sequence of sets of models. The first batch is about checking for the existence of bimodal posterior distributions across different step sizes for the same numerical method. We then use a different numerical method to see if the problem persists. Lastly, we use different means in the parameter priors to see if the posterior can be constrained by such methods.
+
+### Step size data
+We're going to run 100 fits with a step size of 0.5 using the custom RK4 solver and a single chain each. Each chain is expected to converge to a parameter combination that gives estimated $\hat{Y}(t_j)$ close to the analytic solution, but which combination it converges to is random. We do single chains because each can fall into the numerical error trap, and the easiest way to identify that trap is to extract the estimates afterwards.
+
+Each fit takes a few seconds to run, so allow several minutes for the following block. There are likely to be diagnostic problems but that is part of what we are here to explore so we will be ignoring them. Divergent transitions in particular are to be expected when we have the very different parameter estimates we see. Results are hidden for this block.
+```{r, results='hide'}
+runs <- 100
+step_size <- 0.5
+par_est_tibble <- tibble(run = c(),
+ step_size = c(),
+ beta_0 = c(),
+ beta_1 = c())
+
+for(i in 1:runs){
+ #Run the model
+ suppressWarnings(
+ fit <- hmde_model("affine_single_ind") |>
+ hmde_assign_data(n_obs = nrow(obs_data_frame),
+ y_obs = obs_data_frame$y_obs,
+ obs_index = obs_data_frame$obs_index,
+ time = obs_data_frame$time,
+ y_bar = mean(obs_data_frame$y_obs),
+ step_size = step_size,
+ int_method = 1) |> #RK4
+ hmde_run(chains = 1, cores = 1, iter = 2000)
+ )
+
+ #Extract parameter estimates
+ ests <- hmde_extract_estimates(fit = fit,
+ input_measurement_data = obs_data_frame)
+
+ temp <- tibble(
+ run = i,
+ step_size = step_size,
+ beta_0 = ests$individual_data$ind_beta_0_mean,
+ beta_1 = ests$individual_data$ind_beta_1_mean
+ )
+
+ par_est_tibble <- rbind(par_est_tibble, temp)
+}
+```
+
+## Analysis
+We are going to fit a finite mixture model to tell us about the clustering in the posterior distributions. We assume that there are two clusters (you can check with scatter plots), that one is close to the true values and the other some distance away with much larger estimates for both, and use the mean of rows where $\hat{\beta}_0 > \text{mean}(\hat{\beta}_0)$ as our starting value for the iterative process to avoid singularities. The overall mean of the estimates works as a threshold for extreme values because of the bimodality and distance between clusters.
+
+```{r}
+step_size_mix_models <- list()
+step_size_mix_model_plots <- list()
+step_size_mix_models_par_ests <- tibble(
+ good_beta_0 = c(),
+ good_beta_1 = c(),
+ error_beta_0 = c(),
+ error_beta_1 = c(),
+ step_size = c(),
+  error_prob = c(),
+ dist = c()
+)
+
+for(i in 1:length(unique(par_est_tibble$step_size))){
+ #Get data for single step size
+ step_size_selected <- unique(par_est_tibble$step_size)[i]
+
+ analysis_data <- par_est_tibble %>%
+ filter(step_size == step_size_selected)
+
+ #Get some extreme estimates
+ possible_error <- analysis_data %>%
+ filter(beta_0 > mean(analysis_data$beta_0))
+
+ #To speed up the iterative algorithm we provide some initial conditions
+ mu <- list( #Means from true parameters and extreme estimates
+ true = c(beta_0, beta_1),
+ error = c(mean(possible_error$beta_0),
+ mean(possible_error$beta_1))
+ )
+
+ #Fit multivariate normal finite mixture model to the estimates
+ step_size_mix_models[[i]] <- mvnormalmixEM(x = analysis_data[,c(3,4)], mu = mu)
+
+ print(paste0("Summary of mixture model for step size ", step_size_selected))
+ print(summary(step_size_mix_models[[i]]))
+
+ step_size_mix_model_plots[[i]] <- plot(step_size_mix_models[[i]],
+ whichplots = 2,
+ xlab2 = "Beta 0",
+ ylab2 = "Beta 1")
+
+ dist_table <- tibble( #Data to calculate distance
+ b_0 = c(step_size_mix_models[[i]]$mu[[2]][1],
+ step_size_mix_models[[i]]$mu[[1]][1]),
+ b_1 = c(step_size_mix_models[[i]]$mu[[2]][2],
+ step_size_mix_models[[i]]$mu[[1]][2])
+ )
+
+ #Extract values
+ step_size_mix_models_par_ests_temp <- tibble(
+ good_beta_0 = step_size_mix_models[[i]]$mu[[1]][1],
+ good_beta_1 = step_size_mix_models[[i]]$mu[[1]][2],
+ error_beta_0 = step_size_mix_models[[i]]$mu[[2]][1],
+ error_beta_1 = step_size_mix_models[[i]]$mu[[2]][2],
+ step_size = step_size_selected,
+ error_prob = step_size_mix_models[[i]]$lambda[2],
+ dist = dist(dist_table)
+ )
+
+ step_size_mix_models_par_ests <- rbind(step_size_mix_models_par_ests,
+ step_size_mix_models_par_ests_temp)
+}
+
+#Have a look at the estimates
+step_size_mix_models_par_ests
+```
+We get bimodality in the posterior distributions. If you look at multiple step sizes you will see that smaller step sizes push the second cluster further away. Bias in the estimates closest to the true values is due to the same measurement error in the 'observed' data for all the fits. The second mode in the estimates arises from the numerical integration error, as we will verify shortly. The extreme bimodality of the posterior is consistent behaviour, even though the location of the second mode shifts with the step size.
+
+Some aesthetics before plots.
+```{r}
+legend_spec <- tibble(
+ step_size_name = c("0.5", "0.25", "0.125"),
+ step_size = c(0.5, 0.25, 0.125),
+ x = c(-10, -10, -10),
+ y = c(-10, -10, -10),
+ colours = c("#f8766d", "#00ba38", "#609cff"),
+ linetypes = c("longdash", "dashed", "dotted"),
+ shapes = c(19, 17, 15)
+)
+
+legend_spec_with_true <- tibble(
+ step_size_name = c("0.5", "0.25", "0.125", "True pars"),
+ step_size = c(0.5, 0.25, 0.125, NA),
+ x = c(-10, -10, -10, -10),
+ y = c(-10, -10, -10, -10),
+ colours = c("#f8766d", "#00ba38", "#609cff", "black"),
+ linetypes = c("longdash", "dashed", "dotted", "solid"),
+ shapes = c(19, 17, 15, 3)
+)
+
+for(i in 1:nrow(legend_spec)){
+ fancy_name_no_step_size <-
+ paste0("Beta_0 = ",
+ signif(step_size_mix_models_par_ests$error_beta_0[i],
+ digits = 3),
+ ",\n Beta_1 = ",
+ signif(step_size_mix_models_par_ests$error_beta_1[i],
+ digits = 3))
+ legend_spec$fancy_name_no_step_size[i] <- fancy_name_no_step_size
+ legend_spec_with_true$fancy_name_no_step_size[i] <- fancy_name_no_step_size
+
+ fancy_name <- paste0("Step size ", step_size_mix_models_par_ests$step_size[i],
+ "\n", fancy_name_no_step_size)
+
+ legend_spec$fancy_name[i] <- fancy_name
+ legend_spec_with_true$fancy_name[i] <- fancy_name
+}
+
+legend_spec_with_true$fancy_name_no_step_size[4] <-
+ paste0("Beta_0 = ",
+ beta_0,
+ ",\n Beta_1 = ",
+ beta_1)
+
+legend_spec_with_true$fancy_name[4] <-
+ paste0("True values\n Beta_0 = ",
+ beta_0,
+ ",\n Beta_1 = ",
+ beta_1)
+```
+
+We use scatter plots of the clusters for qualitative analysis. As the clusters are so distant from each other, and so tight around their means, we separate them out for plots. The mixture model gives a classification of each point in the data that we use to filter observations. As the clusters are so distant we can use other heuristics, such as $\hat{\beta}_0 > 2\beta_0$, which agree perfectly with the cluster analysis.
+
+The contours over the scatter plot come from data simulated from the cluster's multivariate normal distribution identified by the finite mixture model.
+```{r}
+scatterplot_errors_only <- list()
+scatterplot_good_only <- list()
+
+for(i in 1:length(unique(par_est_tibble$step_size))){
+ step_size_select <- unique(par_est_tibble$step_size)[i]
+
+ plot_data <- par_est_tibble %>%
+ filter(step_size == step_size_select)
+
+  #Get classification from mixture model: posterior probability of cluster 1
+  plot_data$good_est <- step_size_mix_models[[i]][["posterior"]][,1] > 0.5
+
+  error_ests_scatter <- plot_data %>%
+    filter(!good_est)
+  good_ests_scatter <- plot_data %>%
+    filter(good_est)
+
+ #Scatter plot of erroneous parameters
+ xpos <- (min(error_ests_scatter$beta_0) +
+ 0.2*(max(error_ests_scatter$beta_0) -
+ min(error_ests_scatter$beta_0)))
+ ypos <- (max(error_ests_scatter$beta_1) -
+ 0.1*(max(error_ests_scatter$beta_1) -
+ min(error_ests_scatter$beta_1)))
+ norm_data <- as.data.frame(mvrnorm(n = 10000,
+ mu = step_size_mix_models[[i]][["mu"]][[2]],
+ Sigma = step_size_mix_models[[i]][["sigma"]][[2]]))
+ names(norm_data) <- c("beta_0", "beta_1")
+
+ scatterplot_errors_only[[i]] <- ggplot(data = error_ests_scatter,
+ aes(x = beta_0, y = beta_1)) +
+ geom_point(colour = legend_spec$colours[i],
+ shape = legend_spec$shapes[i],
+ alpha = 0.5,
+ size = 2) +
+ geom_density_2d(data = norm_data, colour = "black") +
+ labs(x = "beta_0 est.",
+ y = "beta_1 est.",
+ title = "Second cluster") +
+ annotate("text", x = xpos, y = ypos,
+ label = paste0("Probability: \n",
+ step_size_mix_models_par_ests$error_prob[i])) +
+ theme_classic()
+
+ #Scatter plot of good parameter estimates
+ xpos <- (min(good_ests_scatter$beta_0) +
+ 0.2*(max(good_ests_scatter$beta_0) -
+ min(good_ests_scatter$beta_0)))
+ ypos <- (max(good_ests_scatter$beta_1) -
+ 0.1*(max(good_ests_scatter$beta_1) -
+ min(good_ests_scatter$beta_1)))
+ norm_data <- as.data.frame(mvrnorm(n = 10000,
+ mu = step_size_mix_models[[i]][["mu"]][[1]],
+ Sigma = step_size_mix_models[[i]][["sigma"]][[1]]))
+ names(norm_data) <- c("beta_0", "beta_1")
+
+ scatterplot_good_only[[i]] <- ggplot(data = good_ests_scatter,
+ aes(x = beta_0, y = beta_1)) +
+ geom_point(colour = legend_spec$colours[i],
+ shape = legend_spec$shapes[i],
+ alpha = 0.5,
+ size = 2) +
+ geom_density_2d(data = norm_data, colour = "black") +
+ labs(x = "beta_0 est.",
+ y = "beta_1 est.",
+ title = "First cluster") +
+ annotate("text", x = xpos, y = ypos,
+ label = paste0("Probability: \n",
+ (1-step_size_mix_models_par_ests$error_prob[i]))) +
+ theme_classic()
+}
+```
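+
+The plot lists are ready to display. As a sketch (assuming `patchwork`, which is in Suggests), we can put the two clusters for the first step size side by side:
+```{r, eval=FALSE}
+library(patchwork)
+scatterplot_good_only[[1]] + scatterplot_errors_only[[1]]
+```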
+
+We can double-check the numerical error by using an independent solver with the same step size. We use the `deSolve` package, which has an implementation of RK4 and lets us choose the step size through the `times` argument.
+```{r}
+#install.packages("deSolve")
+library(deSolve)
+
+#Create DE function
+DE <- function(Time, State, Pars) { #Implementation of DE
+ with(as.list(c(State, Pars)), {
+ dY <- beta_0 - beta_1 * Y
+
+ return(list(c(dY)))
+ })
+}
+```
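+
+As a quick sanity check (a sketch assuming `beta_0`, `beta_1`, `true_y_0`, and `analytic_solution` from the earlier chunks), RK4 with a small step size should sit essentially on top of the analytic solution at the true parameters:
+```{r, eval=FALSE}
+times_check <- seq(0, 5, by = 0.001)
+rk4_check <- ode(c(Y = true_y_0), times_check, DE,
+                 c(beta_0 = beta_0, beta_1 = beta_1), method = "rk4")
+#Largest absolute gap between numerical and analytic solutions
+max(abs(rk4_check[, "Y"] - analytic_solution(times_check,
+                                             c(beta_0, beta_1, true_y_0))))
+```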
+
+### Second cluster analysis
+We want to look at the behaviour of the numerical method for the bad estimate clusters. To do so we project forward from the initial condition using the chosen step size and bad parameter combination, and see what happens. We can compare the numerical solution both to the true sizes over time, and to the analytic solution with those same bad parameter estimates.
+
+First we generate the numerical and analytic solution data.
+```{r}
+yini <- c(Y = true_y_0) #Initial condition
+y_over_time <- tibble(model="True Sizes",
+ y_analytic = y_true,
+ y_numeric = y_true,
+ time = 0:max_time,
+ beta_0_par = beta_0,
+ beta_1_par = beta_1
+ )
+
+#Generate Y(t) with RK4
+for(i in 1:nrow(step_size_mix_models_par_ests)){
+ pars_combo <- c(beta_0 = step_size_mix_models_par_ests$error_beta_0[i],
+ beta_1 = step_size_mix_models_par_ests$error_beta_1[i])
+ times <- seq(0, max_time, by = step_size_mix_models_par_ests$step_size[i])
+
+ solution_pars <- c(pars_combo, true_y_0)
+ y_true_temp <- analytic_solution(times, solution_pars)
+
+ numerical_output <- ode(yini, times, DE, pars_combo, method = "rk4")
+
+ y_over_time_temp <- tibble(
+ model = step_size_mix_models_par_ests$step_size[i],
+ y_analytic = y_true_temp,
+ y_numeric = numerical_output[,2],
+ time = times,
+ beta_0_par = step_size_mix_models_par_ests$error_beta_0[i],
+ beta_1_par = step_size_mix_models_par_ests$error_beta_1[i]
+ )
+
+ y_over_time <- rbind(y_over_time, y_over_time_temp)
+}
+```
+
+Here is a figure that shows all of the estimated sizes over time for the bad parameter combinations across different step sizes, compared to the true values.
+```{r}
+y_over_time_filtered <- y_over_time %>%
+ filter(time %in% 0:max_time)
+
+#Plot sizes over time for all models
+compare_sizes_over_time <- ggplot(y_over_time_filtered,
+                                   aes(x=time, y=y_numeric, group = as.factor(model))) +
+ geom_point(aes(colour = as.factor(model),
+ shape = as.factor(model)),
+ alpha=0.5, size = 2, stroke = 1.5) +
+ geom_line(aes(colour = as.factor(model)), alpha=0.5, linewidth = 1) +
+ scale_colour_manual(values = legend_spec_with_true$colours) +
+ scale_shape_manual(values = legend_spec_with_true$shapes) +
+ labs(x = "Time", y = "Y(t)", title = "Estimated Y(t) with bad parameters",
+ colour = "Step size", shape = "Step size") +
+ theme_classic() +
+ theme(legend.position = "inside",
+ legend.position.inside = c(0.7, 0.3))
+
+compare_sizes_over_time
+```
+The trajectories are indistinguishable by eye. If you look extremely closely there is some deviation due to parameter bias, but nothing statistically relevant to the process.
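+
+We can also quantify the agreement (a sketch using `y_over_time_filtered`, `y_true`, and `max_time` from above):
+```{r, eval=FALSE}
+true_sizes <- tibble(time = 0:max_time, y_true_size = y_true)
+y_over_time_filtered %>%
+  filter(model != "True Sizes") %>%
+  left_join(true_sizes, by = "time") %>%
+  group_by(model) %>%
+  summarise(max_abs_gap = max(abs(y_numeric - y_true_size)))
+```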
+
+To demonstrate that re-running the method with a smaller step size is enough to identify bad estimates, we show that RK4 with step size 0.001 diverges from the true sizes over time when given the bad parameter combinations. The lines in this plot are based on the small step size numerical estimates, while the points come from the $Y(t_j)$ values for the discrete observation times.
+```{r}
+#Generate y(t) with RK4 given the parameter estimates
+y_over_time_smallstep <- tibble(model=legend_spec_with_true$fancy_name[4],
+ y_hat = y_true,
+ time = 0:max_time
+ )
+
+for(i in 1:nrow(step_size_mix_models_par_ests)){
+ pars_combo <- c(beta_0 = step_size_mix_models_par_ests$error_beta_0[i],
+ beta_1 = step_size_mix_models_par_ests$error_beta_1[i])
+ times <- seq(0, max_time, by = 0.001)
+
+ numerical_output <- ode(yini, times, DE, pars_combo, method = "rk4")
+
+ y_over_time_temp <- tibble(
+ model = legend_spec_with_true$fancy_name_no_step_size[i],
+ y_hat = numerical_output[,2],
+ time = times
+ )
+
+ y_over_time_smallstep <- rbind(y_over_time_smallstep, y_over_time_temp)
+}
+
+point_data <- y_over_time_smallstep %>%
+ filter(time %in% 0:max_time)
+
+#Plot sizes over time
+compare_sizes_over_time_smallstep <- ggplot(y_over_time_smallstep,
+                                   aes(x=time, y=y_hat, group = as.factor(model))) +
+ geom_line(aes(colour = as.factor(model),
+ linetype = as.factor(model)), alpha=0.5, linewidth = 1) +
+ geom_point(data = point_data,
+ aes(colour = as.factor(model),
+ shape = as.factor(model)),
+ alpha=0.5, size = 2, stroke = 1.5) +
+ geom_function(fun=analytic_solution, args=true_args_list,
+ colour="black",
+ linetype = "solid",
+ linewidth=1) +
+ scale_shape_manual(values = legend_spec_with_true$shapes) +
+ scale_colour_manual(values = legend_spec_with_true$colours) +
+ scale_linetype_manual(values = c(legend_spec$linetypes, NA)) +
+ labs(x = "Time", y = "Y(t)", title = "Small step size",
+ colour = "Parameters",
+ shape = "Parameters",
+ linetype = "Parameters") +
+ theme_classic() +
+ theme(legend.position = "inside",
+ legend.position.inside = c(0.7, 0.4),
+ legend.key.spacing.y = unit(2, 'mm')) +
+ guides(colour = guide_legend(byrow = TRUE))
+
+compare_sizes_over_time_smallstep
+```
+
+Here are two plots showing the ODEs and analytic solutions with the bad estimates compared to the true parameter values. To plot the ODEs we exploit the fact that each is a straight line in $Y$, so we only need the two axis intercepts rather than plotting the functions properly.
+```{r}
+#Get asymptotic size
+step_size_mix_models_par_ests$Y_max <- step_size_mix_models_par_ests$error_beta_0/step_size_mix_models_par_ests$error_beta_1
+
+#Build points for start and end of lines
+plot_data <- tibble(
+ x = c(0, (beta_0/beta_1),
+ rep(0, times = nrow(step_size_mix_models_par_ests)),
+ step_size_mix_models_par_ests$Y_max),
+ y = c(beta_0, 0,
+ step_size_mix_models_par_ests$error_beta_0,
+ rep(0, times = nrow(step_size_mix_models_par_ests))),
+ step_size = c("True pars", "True pars",
+ step_size_mix_models_par_ests$step_size,
+ step_size_mix_models_par_ests$step_size)
+)
+
+#Plot DEs
+error_de_plot <- ggplot(data = plot_data, aes(x,y)) +
+ geom_line(aes(colour = as.factor(step_size),
+ linetype = as.factor(step_size)),
+ linewidth = 1) +
+ scale_colour_manual(values = c(legend_spec$colours[3:1], "black")) +
+ scale_linetype_manual(values = c(legend_spec$linetypes[3:1], "solid")) +
+ labs(title = "ODEs",
+ x = "Y(t)", y = "f",
+ colour = "Step size",
+ linetype = "Step size") +
+ theme_classic() +
+ theme(legend.position = "inside",
+ legend.position.inside = c(0.7, 0.7))
+error_de_plot
+
+#Plot analytic solutions
+error_solution_plot <- ggplot() +
+ geom_function(fun=analytic_solution, args=true_args_list,
+ colour="black",
+ linetype = "solid",
+ linewidth=1)
+
+for(i in 1:nrow(step_size_mix_models_par_ests)){ #Add the analytic solutions
+ args_list <- list(pars=c(step_size_mix_models_par_ests$error_beta_0[i],
+ step_size_mix_models_par_ests$error_beta_1[i],
+ true_y_0))
+ error_solution_plot <- error_solution_plot +
+ geom_function(fun=analytic_solution, args=args_list,
+ colour=legend_spec$colours[i],
+ linetype = legend_spec$linetypes[i],
+ linewidth=1)
+}
+
+error_solution_plot <- error_solution_plot +
+ geom_line(data = legend_spec_with_true,
+ linewidth=1,
+ aes(colour = fancy_name_no_step_size,
+ linetype = fancy_name_no_step_size,
+ x = x, y = y)) +
+ scale_colour_manual(values = c(legend_spec_with_true$colours[4],
+ legend_spec$colours[c(2,3,1)])) +
+ scale_linetype_manual(values = c(legend_spec_with_true$linetypes[4],
+ legend_spec$linetypes[c(2,3,1)])) +
+ xlim(0, max_time) +
+ ylim(true_y_0, (beta_0/beta_1+0.5)) +
+ labs(x = "Time", y = "Y(t)",
+ title = "Analytic solutions",
+ colour = "Parameters",
+ linetype = "Parameters") +
+ theme_classic() +
+ theme(legend.position = "inside",
+ legend.position.inside = c(0.7, 0.4),
+ legend.key.spacing.y = unit(2, 'mm')) +
+ guides(colour = guide_legend(byrow = TRUE))
+
+error_solution_plot
+```
+The limiting behaviour at $\beta_0/\beta_1$ is consistent; what changes is how fast each curve approaches the asymptote.
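+
+We can check this directly (a sketch using the estimates tibble and the true `beta_0` and `beta_1` from above): the bad estimates approximately preserve the ratio $\beta_0/\beta_1$ even though the individual parameters are wrong.
+```{r, eval=FALSE}
+step_size_mix_models_par_ests %>%
+  mutate(true_Y_max = .env$beta_0 / .env$beta_1) %>% #True asymptote
+  select(step_size, Y_max, true_Y_max) #Y_max = error_beta_0 / error_beta_1
+```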
+
+# Where to from here?
+For the purpose of the `hmde` package that this vignette is part of, we account for the pathologies in this particular model by using the analytic solution for the von Bertalanffy equation. More work needs to be done to understand the interaction between numerical methods and MCMC sampling. What we have demonstrated is that the problem exists, and that it is not enough to have numerical stability at the true parameter values: because MCMC estimation moves around the parameter space, you need numerical stability in a potentially quite large region of it. The good news is that simulated data and posterior plots produced with more accurate numerical methods can at least identify that something is going wrong.
diff --git a/vignettes/hmde.Rmd b/vignettes/hmde.Rmd
index 9035094..fcd9777 100644
--- a/vignettes/hmde.Rmd
+++ b/vignettes/hmde.Rmd
@@ -7,6 +7,7 @@ vignette: >
%\VignetteEngine{knitr::rmarkdown}
editor_options:
chunk_output_type: console
+bibliography: vignette.bib
---
```{r, include = FALSE}
@@ -18,10 +19,10 @@ knitr::opts_chunk$set(
library(hmde)
```
-'hmde' which stands for **h**ierarchical **m**ethods for **d**ifferential **e**quations is a package built for biologists with repeat size measurement data who want to fit a specific set of growth functions.
+`hmde` which stands for **h**ierarchical **m**ethods for **d**ifferential **e**quations is a package built for biologists with repeat size measurement data who want to fit a specific set of growth functions.
## Installation
-'hmde' is under active development, you can install the stable, development version of 'hmde' from [GitHub](https://github.com/) with:
+`hmde` is under active development, you can install the stable, development version of `hmde` from [GitHub](https://github.com/) with:
```{r, eval=FALSE}
# install.packages("remotes")
@@ -43,13 +44,13 @@ Y_i(t_{j+1}) = Y_i(t_j) + \int_{t_j}^{t_{j+1}} f(Y(t), \beta_i)\,dt
\end{equation}
where the integral adds up all the growth over the intervening time. Each model we use will comes with its specific growth parameters that we will describe. Some are more biologically interpretable than others. We don't assume that we see the true sizes, and instead have observed size
$$y_{ij} = Y_i(t_j) + \text{ error}.$$
-We have assumed normally distributed error in `hmde`,this has proven reasonably robust in simulation for a more general size-dependent error model. For details see \cite{OBrien2024_Methods}.
+We have assumed normally distributed error in `hmde`; this has proven reasonably robust in simulation for a more general size-dependent error model. For details see @obrien2024allindividuals.
Due to the hierarchical structure of the statistical model, we have distributions that govern the behaviour of growth parameters. If we are modelling only a single individual, we don't worry about the underlying distribution so much. If we have multiple individuals then we have a distribution with hyper-parameters that acts as a population-level feature, so
$$\beta_i \sim \log\mathcal{N}(\mu, \sigma)$$
for example, and we can examine the behaviour of the mean and standard deviation as population-level features.
-## {hmde} supported growth functions
+## `hmde` supported growth functions
- Constant growth
- Von Bertalanffy
@@ -59,11 +60,11 @@ For each growth function, we have implementation to model the growth of a single
## Workflow
-Broadly, the workflow for `hdme` is to:
+Broadly, the workflow for `hmde` is to:
1. Wrangle the data into the required format for a chosen model,
2. Pass the data and the model to a function that runs the sampling with Stan's MCMC algorithms.
-3. Inspect and anaysis the returned `Stan` fit object.
+3. Inspect and analyse the returned `Stan` fit object.
We will demonstrate this workflow using case studies that uses the three growth functions that are supported in `hmde`. You can find these on our website or you can view these in R using:
```{r, eval=FALSE}
@@ -72,5 +73,7 @@ vignettes("von-bertalanffy")
vignettes("canham")
```
-For each case study, we will discuss why that growth function was chosen in the context of the survey process as data availability is a key factor in determining which functions can be used. We will not discuss the mathematical and statistical theory in depth, if that is of interest, check out the vignette 'hmde for Mathematicians' or check out the methodology paper: \cite{obrien2024allindividuals}.
+For each case study, we will discuss why that growth function was chosen in the context of the survey process, as data availability is a key factor in determining which functions can be used. We will not discuss the mathematical and statistical theory in depth; if that is of interest, check out the vignette 'hmde for Mathematicians' or the methodology paper @obrien2024allindividuals.
+
+## References
diff --git a/vignettes/hmde_for_mathematicians.Rmd b/vignettes/hmde_for_mathematicians.Rmd
new file mode 100644
index 0000000..069f4e3
--- /dev/null
+++ b/vignettes/hmde_for_mathematicians.Rmd
@@ -0,0 +1,174 @@
+---
+title: "hmde for Mathematicians"
+output: rmarkdown::html_vignette
+bibliography: vignette.bib
+vignette: >
+ %\VignetteIndexEntry{hmde for Mathematicians}
+ %\VignetteEncoding{UTF-8}
+ %\VignetteEngine{knitr::rmarkdown}
+editor_options:
+ chunk_output_type: console
+---
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+ collapse = TRUE,
+ comment = "#>"
+)
+```
+
+This vignette is intended for a statistical/mathematical audience who are interested in Bayesian inverse problems. For biologists looking for an applications-based walkthrough, the other vignettes in this package -- `hmde`, `constant-growth`, `von-bertalanffy`, and `canham` -- are less theoretical.
+
+# Getting started with `hmde`
+`hmde` is under active development; you can install the development version of `hmde` from [GitHub](https://github.com/) with:
+
+```{r, eval=FALSE}
+# install.packages("remotes")
+remotes::install_github("traitecoevo/hmde")
+```
+
+```{r}
+library(dplyr)
+library(ggplot2)
+library(hmde)
+```
+
+# The Theory
+The underlying method that the `hmde` package implements leverages the longitudinal structure of repeat measurement data to estimate parameters for an underlying differential equation, as first demonstrated in @obrien2024allindividuals. This method is an example of a Bayesian inverse method for differential equation estimation: we are attempting to estimate the parameters of a chosen DE based on observations of the resulting process. We assume that the data consist of a finite number of discrete (likely sparse) observations with measurement error at the individual level, and provide a set number of possible differential equations ready to be fit to the data.
+
+## The Maths
+In a general setting, we are interested in some quantity $Y(t)$ that changes over time (approximately) according to a chosen DE $f$. We have a finite number of measurements at times $t_j$, and believe that the underlying true behaviour is given by
+$$
+Y(t_{j+1}) = Y(t_j) + \int_{t_j}^{t_{j+1}} f(Y(t), \boldsymbol{\theta})\,dt\qquad (1)
+$$
+for unseen parameter vector $\boldsymbol{\theta}$ that we wish to estimate. We also have an initial condition $Y_0 = Y(t_0)$.
+
+We have three primary in-built growth functions in `hmde`, plus an affine model used only for demonstration.
+
+- Constant: $f = \beta$, chosen because when you only have two observations the average growth rate is the best you can do. Furthermore, the results from the constant model align with those of a linear mixed effects model for size with an individual-level parameter. The constant model is size-independent. As all numerical methods are equivalent, and the same as the analytic solution given the initial condition, we use the Euler method for the constant model.
+
+- von Bertalanffy:
+$$f = \beta (Y_{max} - Y(t))\qquad\qquad(2)$$
+which has a history of use in biology for species that grow to some maximum size, and represents a simple size-dependent linear function. If a power law model is desired, log-transforming observations and then back-transforming by exponentiation gives such a function. Equation (2) is implemented with the analytic solution rather than a numerical method, as it is a known nightmare example for numerical stability due to the negative coefficient on $Y(t)$.
+
+- Canham:
+$$
+f = f_{max} \exp\Bigg(-\frac{1}{2}\bigg(\frac{\log(Y(t)/Y_{max})}{k} \bigg)^2 \Bigg),\qquad\qquad(3)
+$$
+which is considered a reasonable approximation of long term growth behaviour for some tree species, as shown in @obrien2024allindividuals and [Chapter 2]. Equation (3) is extremely non-linear and does not have an analytic solution, forcing the use of numerical methods to estimate the growth increments in Equation (1).
+
+- Affine:
+$$f = \beta_0 - \beta_1 Y(t)$$
+which is included only to demonstrate where numerical methods can go wrong, as it is a re-parameterisation of the von Bertalanffy model.
+
+Choice of appropriate function is an exercise for the user and depends on the available data. Aside from the affine model, all have versions that work with both a single individual and multiple individuals. We provide example data intended for use with each of the primary models, with a quick look at their shared structure after the list:
+
+- `Trout_Size_Data` for the constant model,
+- `Lizard_Size_Data` for the von Bertalanffy model,
+- `Tree_Size_Data` for the Canham model.
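+
+All three ship with `hmde` in the long format expected by `hmde_assign_data`: one row per observation, with columns including `y_obs`, `time`, `obs_index`, and `ind_id`. A quick peek at one of them (a sketch; the other two can be checked the same way):
+```{r, eval=FALSE}
+head(Tree_Size_Data)
+```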
+
+## The Stats
+We assume that we do not have access to $\boldsymbol{\theta}$ or the true values of $Y$ over time. Instead we have observations with measurement error,
+$$
+y_j = Y(t_j) + \text{error},
+$$
+and estimate $\boldsymbol{\theta}$ with $\hat{\boldsymbol{\theta}}$.
+
+We use a hierarchical structure to encode different levels of relationships within the data. At the bottom of the hierarchy is the measurement level
+$$
+y_j \sim \mathcal{N}(\hat{Y}(t_j), \sigma_e),
+$$
+where we assume normally distributed error. This may not always be true, but @obrien2024allindividuals indicated that symmetric error centred at 0 may be enough for reasonable results. The longitudinal structure in Equation (1) serves as the next level, connecting estimated sizes over time based on the chosen function $f$ and estimated parameters $\hat{\boldsymbol{\theta}}$, and operates at the level of the individual.
+
+If the data has multiple individuals we add additional layers that act as hyper-parameters on the distributions of elements of each individual $i$'s $\hat{\boldsymbol{\theta}}_i$. We build these to be independent $\theta_{ki}$s, with log-mean and log-standard deviation parameters
+$$
+\theta_k \sim \log\mathcal{N}(\mu_k, \sigma_k),
+$$
+and typically use the following priors:
+$$
+\mu_k \sim \mathcal{N}(0, 2), \quad 0 < \sigma_k \sim Cauchy(0, 2).
+$$
+For details on the prior distributions see the vignette for a specific model or check the Stan file. To see the default values run `hmde_model` on the model name as a string and look at the `prior_pars` argument for the parameter name:
+```{r}
+hmde_model("canham_multi_ind")
+```
+
+The error parameter $\sigma_e >0$ is assumed to operate at a global level independent of individual and typically has a Cauchy prior with location 0, spread parameter 2.
+
+Estimation is done using MCMC through Stan.
+
+## Integration of time series
+Numerical methods are required for the Canham model as it has no analytic solution, and the inbuilt Stan Runge-Kutta 4-5 solver is used.
+
+For the von Bertalanffy model an analytic solution is used in order to avoid numerical problems. For the constant model all numerical methods are the same and give the same result as the analytic solution so Euler is used.
+
+
+# Demonstration: Canham Growth - Multiple Individuals
+The provided tree data for 50 individuals takes a few hours to run. As such, the following block does not run by default, and instead we leverage the provided estimates data file: `Tree_Size_Ests`. The estimates are posterior mean, median, and 95\% central credible intervals for parameters, and mean posterior estimates of sizes over time based on the longitudinal model.
+```{r, eval=FALSE}
+# Build fit and extract estimates
+canham_multi_ind_fit <- hmde_model("canham_multi_ind") |>
+ hmde_assign_data(data = Tree_Size_Data) |>
+ hmde_run(chains = 1, cores = 1, iter = 1000)
+
+Tree_Size_Ests <- hmde_extract_estimates(fit = canham_multi_ind_fit,
+ input_measurement_data = Tree_Size_Data)
+```
+
+The following code produces plots of the Canham function for each individual between the first and last estimated size. The purpose of the plot is to look at how different individual growth functions behave.
+```{r}
+summary(Tree_Size_Ests)
+
+# Plot fitted growth function pieces
+plot_par_individual_data <- Tree_Size_Ests$individual_data[,c(1, 2, 6, 10)] #Pull out estimates only
+hmde_plot_de_pieces(model = "canham_multi_ind",
+ individual_data = plot_par_individual_data,
+ measurement_data = Tree_Size_Ests$measurement_data)
+```
+Each line represents 25 years of growth for the specific individual. Lines that sit lower on the $y$-axis are shorter horizontally because they are traversed more slowly, as $f$ is the rate of change in $Y$.
+
+To understand the error model, the following code plots sizes over time for a sample of individuals, then superimposes those individuals' growth functions in black on the plot of all fitted growth function pieces. To view a specific individual, use the `ind_id` value to select them.
+```{r}
+#Plots of size over time for a sample of 5 individuals
+sample_ids <- sample(1:nrow(Tree_Size_Ests$individual_data), size=5) %>%
+ sort()
+plot_data <- Tree_Size_Ests$measurement_data %>%
+ filter(ind_id %in% sample_ids)
+
+ind_size_lims <- Tree_Size_Ests$measurement_data %>%
+ filter(ind_id %in% sample_ids)%>%
+ group_by(ind_id) %>%
+ summarise(y_0 = min(y_hat),
+ y_final = max(y_hat))
+
+ggplot(data=plot_data, aes(group = ind_id)) +
+ geom_point(aes(x = time, y=y_obs, colour = as.factor(ind_id)),
+ shape = 1) +
+ geom_line(aes(x = time, y=y_obs, colour = as.factor(ind_id)),
+ linetype = "dashed") +
+ geom_point(aes(x = time, y=y_hat, colour = as.factor(ind_id)),
+ shape = 2) +
+ geom_line(aes(x = time, y=y_hat, colour = as.factor(ind_id)),
+ linetype = "solid") +
+ labs(x="Time (years)", y="DBH (cm)", colour="Ind. ID") +
+ theme_classic()
+
+#Load DE for Canham
+DE_function = hmde_model_des("canham_multi_ind")
+
+#Produce plot with focus inds
+function_plot <- hmde_plot_de_pieces(model = "canham_multi_ind",
+ individual_data = plot_par_individual_data,
+ measurement_data = Tree_Size_Ests$measurement_data,
+ alpha = 0.2)
+for(i in 1:length(sample_ids)){
+ args_list <- list(pars=Tree_Size_Ests$individual_data[sample_ids[i],c(2, 6, 10)])
+
+ function_plot <- function_plot +
+ geom_function(fun=DE_function, args=args_list,
+ colour="black", linewidth=1, alpha = 1,
+ xlim=c(ind_size_lims$y_0[i], ind_size_lims$y_final[i]))
+
+}
+
+function_plot
+```
diff --git a/vignettes/vignette.bib b/vignettes/vignette.bib
index cc48c3b..ef487fe 100644
--- a/vignettes/vignette.bib
+++ b/vignettes/vignette.bib
@@ -157,3 +157,4 @@ @article{canham2004neighborhood
year={2004},
publisher={NRC Research Press Ottawa, Canada}
}
+
diff --git a/vignettes/von-bertalanffy.Rmd b/vignettes/von-bertalanffy.Rmd
index adb6a1d..daebe2a 100644
--- a/vignettes/von-bertalanffy.Rmd
+++ b/vignettes/von-bertalanffy.Rmd
@@ -5,17 +5,21 @@ vignette: >
%\VignetteIndexEntry{von-bertalanffy}
%\VignetteEncoding{UTF-8}
%\VignetteEngine{knitr::rmarkdown}
+bibliography: vignette.bib
editor_options:
chunk_output_type: console
---
-
+
```{r, include = FALSE}
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
+options(rmarkdown.html_vignette.check_title = FALSE)
```
+### Load dependencies
+
```{r setup}
# remotes::install_github("traitecoevo/hmde")
# install.packages(c("dplyr", "ggplot2"))
@@ -25,12 +29,36 @@ library(dplyr)
library(ggplot2)
```
+## Overview
Our second demo introduces size-dependent growth based on the von Bertalanffy function
-$$f(Y(t); S_{max}, \beta) = \beta(S_{max} - Y(t)),$$
+\begin{equation}\label{eqn_vb_DE}
+f(Y(t); S_{max}, \beta) = \beta(S_{max} - Y(t)),
+\end{equation}
where $S_{max}$ is the asymptotic maximum size and $\beta$ controls the growth rate. We have implemented the analytic solution
-$$Y(t) = S_{max} + (Y(0) - S_{max}) \exp(-t\beta)$$
-which is independent of age at the starting size $Y(0)$ and instead uses the first size as the initial condition. The key behaviour of the von Bertalanffy model is a high growth rate at small sizes that declines linearly as the size approaches $S_{max}$. This manifests as growth slowing as a creature matures with a hard finite limit on the eventual size. We restrict $\beta$ and $S_{max}$ to be positive, and $S_{max}$ to be larger than the observed sizes. As a result the growth rate is non-negative.
+\begin{equation}\label{eqn_vb_soln}
+Y(t) = S_{max} + (Y(0) - S_{max}) \exp(-t\beta)
+\end{equation}
+which is independent of age at the starting size $Y(0)$ and instead uses the first size as the initial condition. The key behaviour of the von Bertalanffy model is a high growth rate at small sizes that declines linearly as the size approaches $S_{max}$. This manifests as growth slowing as a creature matures with a hard finite limit on the eventual size. We restrict $\beta$ and $S_{max}$ to be positive. As a result the growth rate is non-negative.
+
+### Priors
+The default priors for the constant top-level parameters in the single individual model are
+$$S_{max} \sim \log\mathcal{N}(\log(\max(y_{obs})), 2),$$
+$$\beta \sim \log\mathcal{N}(0, 2),$$
+$$0 <\sigma_e \sim Cauchy(0, 2).$$
+
+For the multi-individual model the prior structure and default parameters are
+$$\mu_{S_{max}} \sim \mathcal{N}(\log(\max(y_{obs})), 2),$$
+$$0 < \sigma_{S_{max}} \sim Cauchy(0, 2),$$
+$$\mu_{\beta} \sim \mathcal{N}(0, 2),$$
+$$0 < \sigma_{\beta} \sim Cauchy(0, 2),$$
+$$0 < \sigma_{e} \sim Cauchy(0, 2).$$
+The max size parameter priors are always centred at the (transformed) maximum observed size. This is not changeable, but the standard deviation is. To see the name for the prior parameter, run `hmde_model`. For example, in the following we look up the name needed to change the prior standard deviation for $S_{max}$ (`ind_max_size`) in the single individual model:
+```{r}
+hmde_model("vb_single_ind")
+#prior_pars_ind_max_size_sd_only is the argument name for the prior parameter
+```
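+
+As a sketch of how that override might look (assuming a hypothetical `single_lizard_data` tibble prepared for the single-individual model; the parameter name comes from the `hmde_model` output above and is passed through `hmde_assign_data`'s name-value pairs):
+```{r, eval=FALSE}
+vb_fit_tight_prior <- hmde_model("vb_single_ind") |>
+  hmde_assign_data(data = single_lizard_data, #Hypothetical single-individual tibble
+                   prior_pars_ind_max_size_sd_only = 1) |> #Smaller prior standard deviation
+  hmde_run(chains = 4, cores = 1, iter = 2000)
+```
+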
+### Visualise model
In the following code we plot an example of the growth function and the solution to get a feel for the behaviour.
```{r}
#Analytic solution in function form
@@ -78,10 +106,10 @@ ggplot() +
```
-The von Bertalanffy model is commonly used in fishery management \citep{flinn2021trends}, but has also been used in reptile studies such as \cite{edmonds2021growing, zhao2020age}.
+The von Bertalanffy model is commonly used in fishery management [@flinn2021trends], but has also been used in reptile studies such as @edmonds2021growing and @zhao2020age.
## Lizard size data
-Our data is sourced from \cite{Kar2023_LizardMass} which measured mass and snout-vent-length (SVL) of delicate skinks -- \textit{Lampropholis delicata} -- under experimental conditions to examine the effect of temperature on development. We are going to use the SVL metric for size.
+Our data is sourced from @Kar2023_LizardMass which measured mass and snout-vent-length (SVL) of delicate skinks -- \textit{Lampropholis delicata} -- under experimental conditions to examine the effect of temperature on development. We are going to use the SVL metric for size.
We took a simple random sample without replacement of 50 individuals with at least 5 observations each. The von Bertalanffy model can be fit to shorter observation lengths, but fewer than 3 observations is not advised as there are two growth parameters per individual.
@@ -92,14 +120,13 @@ lizard_vb_fit <- hmde_model("vb_multi_ind") |>
hmde_assign_data(data = Lizard_Size_Data) |>
hmde_run(chains = 4, cores = 1, iter = 2000)
-lizard_vb_estimates <- hmde_extract_estimates(model = "vb_multi_ind",
- fit = lizard_vb_fit,
- input_measurement_data = Lizard_Size_Data)
+lizard_estimates <- hmde_extract_estimates(fit = lizard_vb_fit,
+ input_measurement_data = Lizard_Size_Data)
```
As before, we can compare the observed sizes over time to those predicted by the model.
```{r}
-measurement_data_transformed <- lizard_vb_estimates$measurement_data %>%
+measurement_data_transformed <- lizard_estimates$measurement_data %>%
group_by(ind_id) %>%
mutate(
delta_y_obs = y_obs - lag(y_obs),
@@ -110,6 +137,7 @@ measurement_data_transformed <- lizard_vb_estimates$measurement_data %>%
) %>%
ungroup()
+
#Distributions of estimated growth and size
hist(measurement_data_transformed$y_hat,
main = "Estimated size distribution",
@@ -123,38 +151,50 @@ hist(measurement_data_transformed$est_growth_rate,
#Quantitative R^2
cor(measurement_data_transformed$y_obs, measurement_data_transformed$y_hat)^2
+r_sq_est <- cor(lizard_estimates$measurement_data$y_obs,
+ lizard_estimates$measurement_data$y_hat)^2
+r_sq <- paste0("R^2 = ",
+ signif(r_sq_est,
+ digits = 3))
+
+obs_scatter <- ggplot(data = lizard_estimates$measurement_data,
+ aes(x = y_obs, y = y_hat)) +
+ geom_point(shape = 16, size = 1, colour = "green4") +
+ xlab("Y obs.") +
+ ylab("Y est.") +
+ geom_abline(slope = 1, linetype = "dashed") +
+ annotate("text", x = 25, y = 18,
+ label = r_sq) +
+ theme_classic()
#Plots of size over time for a sample of 5 individuals
-sample_ids <- sample(1:nrow(lizard_vb_estimates$individual_data), size=5)
-plot_data <- measurement_data_transformed %>%
- filter(ind_id %in% sample_ids)
-
-ggplot(data=plot_data, aes(group = ind_id)) +
- geom_point(aes(x = time, y=y_obs, colour = as.factor(ind_id)),
- shape = 1) +
- geom_line(aes(x = time, y=y_obs, colour = as.factor(ind_id)),
- linetype = "dashed") +
- geom_point(aes(x = time, y=y_hat, colour = as.factor(ind_id)),
- shape = 2) +
- geom_line(aes(x = time, y=y_hat, colour = as.factor(ind_id)),
- linetype = "solid") +
- labs(x="Time (days)", y="Size (mm)", colour="Ind. ID") +
- theme_classic()
+obs_est_ind <- hmde_plot_obs_est_inds(n_ind_to_plot = 5,
+ measurement_data = lizard_estimates$measurement_data) +
+ theme(legend.position = "inside",
+ legend.position.inside = c(0.8, 0.2))
```
We have two parameters at the individual level and are interested in both their separate distributions, and if we see evidence of a relationship between them. We can also use the individual parameter estimates and estimated sizes to plot the growth function pieces.
```{r}
#1-dimensional parameter distributions
-hist(lizard_vb_estimates$individual_data$ind_max_size_mean,
- main = "Individual max size parameters",
- xlab = "Smax estimate")
+s_max_hist <- ggplot(lizard_estimates$individual_data,
+ aes(ind_max_size_mean)) +
+ geom_histogram(bins = 10,
+ colour = "black",
+ fill = "lightblue") +
+ labs(x="S_max estimate") +
+ theme_classic()
-hist(lizard_vb_estimates$individual_data$ind_growth_rate_mean,
- main = "Individual growth rate parameters",
- xlab = "beta estimate")
+beta_hist <- ggplot(lizard_estimates$individual_data,
+ aes(ind_growth_rate_mean)) +
+ geom_histogram(bins = 10,
+ colour = "black",
+ fill = "lightblue") +
+ labs(x="beta estimate") +
+ theme_classic()
#2-dimensional parameter distribution
-ggplot(data = lizard_vb_estimates$individual_data,
+par_scatter <- ggplot(data = lizard_estimates$individual_data,
aes(x = ind_max_size_mean, y = ind_growth_rate_mean)) +
geom_point(shape = 16, size = 1, colour = "green4") +
xlab("Individual max sizes (mm)") +
@@ -162,49 +202,54 @@ ggplot(data = lizard_vb_estimates$individual_data,
theme_classic()
#Correlation of parameters
-cor(lizard_vb_estimates$individual_data$ind_max_size_mean,
- lizard_vb_estimates$individual_data$ind_growth_rate_mean)
+cor(lizard_estimates$individual_data$ind_max_size_mean,
+ lizard_estimates$individual_data$ind_growth_rate_mean,
+ method = "spearman")
#Plot function pieces over estimated sizes.
-hmde_plot_de_pieces(model = "vb_multi_ind",
- individual_data = lizard_vb_estimates$individual_data,
- measurement_data = lizard_vb_estimates$measurement_data)
+de_pieces <- hmde_plot_de_pieces(model = "vb_multi_ind",
+ individual_data = lizard_estimates$individual_data,
+ measurement_data = lizard_estimates$measurement_data)
```
At the hyper-parameter level for the whole population we have centre and spread parameters for the log-normal distributions of $S_{max}$ and $\beta$. As before, we can look at these as species-level features.
```{r}
-#Max size
-lizard_vb_estimates$population_data$mean[1] #Raw value
-print(paste0("95% CI for mean log max size: (",
- lizard_vb_estimates$population_data$CI_lower[1], " , ",
- lizard_vb_estimates$population_data$CI_upper[1], ")")) #Raw CI
-
-exp(lizard_vb_estimates$population_data$mean[1]) #In mm units
-print(paste0("95% CI for mean max size in mm: (",
- exp(lizard_vb_estimates$population_data$CI_lower[1]), " , ",
- exp(lizard_vb_estimates$population_data$CI_upper[1]), ")"))
-
-#Standard deviation of underlying normal distribution
-lizard_vb_estimates$population_data$mean[2]
-print(paste0("95% CI for log max size standard deviation: (",
- lizard_vb_estimates$population_data$CI_lower[2], " , ",
- lizard_vb_estimates$population_data$CI_upper[2], ")")) #Raw CI
-
-#Beta
-lizard_vb_estimates$population_data$mean[3] #Raw value
-print(paste0("95% CI for mean log growth par: (",
- lizard_vb_estimates$population_data$CI_lower[3], " , ",
- lizard_vb_estimates$population_data$CI_upper[3], ")")) #Raw CI
-
-exp(lizard_vb_estimates$population_data$mean[3]) #In cm/yr units
-print(paste0("95% CI for mean growth par mm/yr: (",
- exp(lizard_vb_estimates$population_data$CI_lower[3]), " , ",
- exp(lizard_vb_estimates$population_data$CI_upper[3]), ")"))
-
-#Standard deviation of underlying normal distribution
-lizard_vb_estimates$population_data$mean[4]
-print(paste0("95% CI for log growth par standard deviation: (",
- lizard_vb_estimates$population_data$CI_lower[4], " , ",
- lizard_vb_estimates$population_data$CI_upper[4], ")")) #Raw CI
+pars_CI_names <- c(
+ "mean log max size",
+ "mean max size in mm",
+ "log max size standard deviation",
+ "mean log growth par",
+ "mean growth par mm/yr",
+ "log growth par standard deviation"
+)
+#Vector that picks out which pars to be exponentiated
+exp_vec <- c(FALSE, TRUE, FALSE,
+ FALSE, TRUE, FALSE)
+
+#Print mean estimates and CIs
+for(i in 1:nrow(lizard_estimates$population_data)){
+ if(!exp_vec[i]){
+ lizard_estimates$population_data$mean[i]
+ print(paste0("95% CI for ",
+ pars_CI_names[i],
+ ": (",
+ lizard_estimates$population_data$CI_lower[i],
+ ", ",
+ lizard_estimates$population_data$CI_upper[i],
+ ")"))
+ } else {
+ exp(lizard_estimates$population_data$mean[i])
+ print(paste0("95% CI for ",
+ pars_CI_names[i],
+ ": (",
+ exp(lizard_estimates$population_data$CI_lower[i]),
+ ", ",
+ exp(lizard_estimates$population_data$CI_upper[i]),
+ ")"))
+ }
+}
```
+
+## References
+