diff --git a/R/checkers.R b/R/checkers.R index 898a1e0d..a7e41eb8 100644 --- a/R/checkers.R +++ b/R/checkers.R @@ -123,7 +123,6 @@ onset_to_death = NULL, onset_to_recovery = NULL, add_names = NULL, - add_ct = NULL, case_type_probs = NULL, contact_tracing_status_probs = NULL, hosp_risk = NULL, @@ -149,7 +148,6 @@ .check_func_req_args(onset_to_death, func_name = "onset_to_death") .check_func_req_args(onset_to_recovery, func_name = "onset_to_recovery") checkmate::assert_logical(add_names, len = 1) - checkmate::assert_logical(add_ct, len = 1) checkmate::assert_numeric(case_type_probs, len = 3, lower = 0, upper = 1) checkmate::assert_names( names(case_type_probs), diff --git a/R/sim_internal.R b/R/sim_internal.R index 73f77d90..1f77ffd0 100644 --- a/R/sim_internal.R +++ b/R/sim_internal.R @@ -23,7 +23,6 @@ non_hosp_death_risk = NULL, outbreak_start_date, add_names = NULL, - add_ct = NULL, outbreak_size, population_age, case_type_probs = NULL, @@ -142,14 +141,12 @@ ) # add Ct if confirmed - if (add_ct) { - .data <- .add_ct( - .data = .data, - distribution = config$ct_distribution, - config$ct_distribution_params - ) - linelist_cols <- c(linelist_cols, "ct_value") - } + .data <- .add_ct( + .data = .data, + distribution = config$ct_distribution, + config$ct_distribution_params + ) + linelist_cols <- c(linelist_cols, "ct_value") } if (sim_type %in% c("contacts", "outbreak")) { diff --git a/R/sim_linelist.R b/R/sim_linelist.R index 4d244be2..d6ddb62e 100644 --- a/R/sim_linelist.R +++ b/R/sim_linelist.R @@ -67,10 +67,6 @@ #' @param outbreak_start_date A `date` for the start of the outbreak. #' @param add_names A `logical` boolean for whether to add names to each row #' of the line list. Default is `TRUE`. -#' @param add_ct A `logical` boolean for whether to add Ct values to each -#' confirmed case and `NA` otherwise for each case in the line list. -#' Default is `TRUE`. Ct refers to the Cycle threshold from a Real-time -#' PCR or quantitative PCR (qPCR). 
#' @param outbreak_size A `numeric` vector of length 2 defining the minimum and #' the maximum number of infected individuals for the simulated outbreak. #' Default is `c(10, 1e4)`, so the minimum outbreak size is 10 infected @@ -92,7 +88,7 @@ #' each case type. The names of the vector must be `"suspected"`, `"probable"`, #' `"confirmed"`. Values of each case type must sum to one. #' @param config A list of settings to adjust the randomly sampled delays and -#' Ct values (if `add_ct = TRUE`). See [create_config()] for more information. +#' Ct values. See [create_config()] for more information. #' #' @return A line list `` #' @export @@ -167,7 +163,6 @@ sim_linelist <- function(contact_distribution, non_hosp_death_risk = 0.05, outbreak_start_date = as.Date("2023-01-01"), add_names = TRUE, - add_ct = TRUE, outbreak_size = c(10, 1e4), population_age = c(1, 90), case_type_probs = c( @@ -203,7 +198,6 @@ sim_linelist <- function(contact_distribution, onset_to_death = onset_to_death, onset_to_recovery = onset_to_recovery, add_names = add_names, - add_ct = add_ct, case_type_probs = case_type_probs, hosp_risk = hosp_risk, hosp_death_risk = hosp_death_risk, @@ -261,7 +255,6 @@ sim_linelist <- function(contact_distribution, non_hosp_death_risk = non_hosp_death_risk, outbreak_start_date = outbreak_start_date, add_names = add_names, - add_ct = add_ct, outbreak_size = outbreak_size, population_age = population_age, case_type_probs = case_type_probs, diff --git a/R/sim_outbreak.R b/R/sim_outbreak.R index 3f4d169e..2b844730 100644 --- a/R/sim_outbreak.R +++ b/R/sim_outbreak.R @@ -64,7 +64,6 @@ sim_outbreak <- function(contact_distribution, non_hosp_death_risk = 0.05, outbreak_start_date = as.Date("2023-01-01"), add_names = TRUE, - add_ct = TRUE, outbreak_size = c(10, 1e4), population_age = c(1, 90), case_type_probs = c( @@ -105,7 +104,6 @@ sim_outbreak <- function(contact_distribution, onset_to_death = onset_to_death, onset_to_recovery = onset_to_recovery, add_names = add_names, - 
add_ct = add_ct, case_type_probs = case_type_probs, contact_tracing_status_probs = contact_tracing_status_probs, hosp_risk = hosp_risk, @@ -164,7 +162,6 @@ sim_outbreak <- function(contact_distribution, non_hosp_death_risk = non_hosp_death_risk, outbreak_start_date = outbreak_start_date, add_names = add_names, - add_ct = add_ct, outbreak_size = outbreak_size, population_age = population_age, case_type_probs = case_type_probs, diff --git a/_pkgdown.yml b/_pkgdown.yml index e30026ef..d9f0d266 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -18,12 +18,16 @@ reference: - create_config articles: -- title: Package Vignettes - navbar: Package Vignettes +- title: Customising simulated outbreak + navbar: Customising simulated outbreak contents: - age-strat-risks - age-struct-pop - time-varying-cfr +- title: Wrangling and plotting data + navbar: Wrangling and plotting data + contents: + - wrangling-linelist - vis-linelist - title: Developer Documentation navbar: Developer Documentation diff --git a/inst/WORDLIST b/inst/WORDLIST index 32bcd2d2..2615f614 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -2,6 +2,7 @@ aes apyramid bookdown bw +Çetinkaya cfr CMD codecov @@ -13,6 +14,7 @@ COVID Ct ct db +Deon df dist dplyr @@ -27,6 +29,7 @@ facetted ggplot gh github +Grolemund implmented infector integerish @@ -36,6 +39,8 @@ lifecycle Lifecycle linelist lintr +Lusseau +Mancini md MERS olds @@ -52,8 +57,9 @@ qPCR randomNames RECON redocumented -rmarkdown resimulate +rmarkdown +Rundel SARS sensu sim @@ -65,6 +71,7 @@ tabset testthat threejs tidyr +tidyverse Tidyverse visNetwork yaml diff --git a/man/dot-add_date.Rd b/man/dot-add_date.Rd index f0a87cf7..36f7816a 100644 --- a/man/dot-add_date.Rd +++ b/man/dot-add_date.Rd @@ -80,7 +80,7 @@ specified in the \code{config} the \code{non_hosp_death_risk} is interpreted as maximum risk across the epidemic.} \item{config}{A list of settings to adjust the randomly sampled delays and -Ct values (if \code{add_ct = TRUE}). 
See \code{\link[=create_config]{create_config()}} for more information.} +Ct values. See \code{\link[=create_config]{create_config()}} for more information.} } \value{ A \verb{} with one more column than input into \code{.data}. diff --git a/man/dot-check_sim_input.Rd b/man/dot-check_sim_input.Rd index 2b44e8da..c1550d0c 100644 --- a/man/dot-check_sim_input.Rd +++ b/man/dot-check_sim_input.Rd @@ -15,7 +15,6 @@ onset_to_death = NULL, onset_to_recovery = NULL, add_names = NULL, - add_ct = NULL, case_type_probs = NULL, contact_tracing_status_probs = NULL, hosp_risk = NULL, @@ -73,11 +72,6 @@ recover get an \code{NA} in the \verb{$date_outcome} line list column.} \item{add_names}{A \code{logical} boolean for whether to add names to each row of the line list. Default is \code{TRUE}.} -\item{add_ct}{A \code{logical} boolean for whether to add Ct values to each -confirmed case and \code{NA} otherwise for each case in the line list. -Default is \code{TRUE}. Ct refers to the Cycle threshold from a Real-time -PCR or quantitative PCR (qPCR).} - \item{case_type_probs}{A named \code{numeric} vector with the probability of each case type. The names of the vector must be \code{"suspected"}, \code{"probable"}, \code{"confirmed"}. Values of each case type must sum to one.} diff --git a/man/dot-sim_internal.Rd b/man/dot-sim_internal.Rd index 6095f279..f100cd59 100644 --- a/man/dot-sim_internal.Rd +++ b/man/dot-sim_internal.Rd @@ -18,7 +18,6 @@ within \pkg{simulist}} non_hosp_death_risk = NULL, outbreak_start_date, add_names = NULL, - add_ct = NULL, outbreak_size, population_age, case_type_probs = NULL, @@ -88,11 +87,6 @@ maximum risk across the epidemic.} \item{add_names}{A \code{logical} boolean for whether to add names to each row of the line list. Default is \code{TRUE}.} -\item{add_ct}{A \code{logical} boolean for whether to add Ct values to each -confirmed case and \code{NA} otherwise for each case in the line list. -Default is \code{TRUE}. 
Ct refers to the Cycle threshold from a Real-time -PCR or quantitative PCR (qPCR).} - \item{outbreak_size}{A \code{numeric} vector of length 2 defining the minimum and the maximum number of infected individuals for the simulated outbreak. Default is \code{c(10, 1e4)}, so the minimum outbreak size is 10 infected @@ -122,7 +116,7 @@ be \code{"under_followup"}, \code{"lost_to_followup"}, \code{"unknown"}. Values contact tracing status must sum to one.} \item{config}{A list of settings to adjust the randomly sampled delays and -Ct values (if \code{add_ct = TRUE}). See \code{\link[=create_config]{create_config()}} for more information.} +Ct values. See \code{\link[=create_config]{create_config()}} for more information.} } \value{ A \verb{} if \code{sim_type} is \code{"linelist"} or \code{"contacts"}, or a diff --git a/man/dot-sim_network_bp.Rd b/man/dot-sim_network_bp.Rd index b3fe498c..83197fc1 100644 --- a/man/dot-sim_network_bp.Rd +++ b/man/dot-sim_network_bp.Rd @@ -31,7 +31,7 @@ infectious period.} contact being infected by an infected primary contact.} \item{config}{A list of settings to adjust the randomly sampled delays and -Ct values (if \code{add_ct = TRUE}). See \code{\link[=create_config]{create_config()}} for more information.} +Ct values. See \code{\link[=create_config]{create_config()}} for more information.} } \value{ A \verb{} with the contact and transmission chain data. diff --git a/man/sim_contacts.Rd b/man/sim_contacts.Rd index 8c2c952a..63f555da 100644 --- a/man/sim_contacts.Rd +++ b/man/sim_contacts.Rd @@ -60,7 +60,7 @@ be \code{"under_followup"}, \code{"lost_to_followup"}, \code{"unknown"}. Values contact tracing status must sum to one.} \item{config}{A list of settings to adjust the randomly sampled delays and -Ct values (if \code{add_ct = TRUE}). See \code{\link[=create_config]{create_config()}} for more information.} +Ct values. 
See \code{\link[=create_config]{create_config()}} for more information.} } \value{ A contacts \verb{} diff --git a/man/sim_linelist.Rd b/man/sim_linelist.Rd index b5f0dba9..8acfd0d3 100644 --- a/man/sim_linelist.Rd +++ b/man/sim_linelist.Rd @@ -16,7 +16,6 @@ sim_linelist( non_hosp_death_risk = 0.05, outbreak_start_date = as.Date("2023-01-01"), add_names = TRUE, - add_ct = TRUE, outbreak_size = c(10, 10000), population_age = c(1, 90), case_type_probs = c(suspected = 0.2, probable = 0.3, confirmed = 0.5), @@ -82,11 +81,6 @@ maximum risk across the epidemic.} \item{add_names}{A \code{logical} boolean for whether to add names to each row of the line list. Default is \code{TRUE}.} -\item{add_ct}{A \code{logical} boolean for whether to add Ct values to each -confirmed case and \code{NA} otherwise for each case in the line list. -Default is \code{TRUE}. Ct refers to the Cycle threshold from a Real-time -PCR or quantitative PCR (qPCR).} - \item{outbreak_size}{A \code{numeric} vector of length 2 defining the minimum and the maximum number of infected individuals for the simulated outbreak. Default is \code{c(10, 1e4)}, so the minimum outbreak size is 10 infected @@ -111,7 +105,7 @@ each case type. The names of the vector must be \code{"suspected"}, \code{"proba \code{"confirmed"}. Values of each case type must sum to one.} \item{config}{A list of settings to adjust the randomly sampled delays and -Ct values (if \code{add_ct = TRUE}). See \code{\link[=create_config]{create_config()}} for more information.} +Ct values. 
See \code{\link[=create_config]{create_config()}} for more information.} } \value{ A line list \verb{} diff --git a/man/sim_outbreak.Rd b/man/sim_outbreak.Rd index 224a93d0..7774bffd 100644 --- a/man/sim_outbreak.Rd +++ b/man/sim_outbreak.Rd @@ -16,7 +16,6 @@ sim_outbreak( non_hosp_death_risk = 0.05, outbreak_start_date = as.Date("2023-01-01"), add_names = TRUE, - add_ct = TRUE, outbreak_size = c(10, 10000), population_age = c(1, 90), case_type_probs = c(suspected = 0.2, probable = 0.3, confirmed = 0.5), @@ -84,11 +83,6 @@ maximum risk across the epidemic.} \item{add_names}{A \code{logical} boolean for whether to add names to each row of the line list. Default is \code{TRUE}.} -\item{add_ct}{A \code{logical} boolean for whether to add Ct values to each -confirmed case and \code{NA} otherwise for each case in the line list. -Default is \code{TRUE}. Ct refers to the Cycle threshold from a Real-time -PCR or quantitative PCR (qPCR).} - \item{outbreak_size}{A \code{numeric} vector of length 2 defining the minimum and the maximum number of infected individuals for the simulated outbreak. Default is \code{c(10, 1e4)}, so the minimum outbreak size is 10 infected @@ -118,7 +112,7 @@ be \code{"under_followup"}, \code{"lost_to_followup"}, \code{"unknown"}. Values contact tracing status must sum to one.} \item{config}{A list of settings to adjust the randomly sampled delays and -Ct values (if \code{add_ct = TRUE}). See \code{\link[=create_config]{create_config()}} for more information.} +Ct values. 
See \code{\link[=create_config]{create_config()}} for more information.} } \value{ A list with two elements: diff --git a/tests/testthat/_snaps/sim_linelist.md b/tests/testthat/_snaps/sim_linelist.md index 8411350e..fe7eda1b 100644 --- a/tests/testthat/_snaps/sim_linelist.md +++ b/tests/testthat/_snaps/sim_linelist.md @@ -66,40 +66,6 @@ 11 2023-01-01 2023-01-04 NA 12 2023-01-01 2023-01-03 NA -# sim_linelist works as expected without Ct - - Code - sim_linelist(contact_distribution = contact_distribution, infect_period = infect_period, - prob_infect = 0.5, onset_to_hosp = onset_to_hosp, onset_to_death = onset_to_death, - add_ct = FALSE) - Output - id case_name case_type sex age date_onset date_admission - 1 1 Dominic Sundara probable m 35 2023-01-01 - 2 2 Preston Montgomery suspected m 43 2023-01-01 - 3 3 Reece Chittum probable m 1 2023-01-01 - 4 5 Michael Cheek confirmed m 78 2023-01-01 - 5 6 Jennifer Smith confirmed f 22 2023-01-01 - 6 8 Erika Quintero confirmed f 28 2023-01-01 - 7 11 Isaiah Patterson suspected m 46 2023-01-01 2023-01-13 - 8 12 Cicely Anderson suspected f 67 2023-01-01 - 9 13 Michael John probable m 86 2023-01-01 2023-01-01 - 10 18 Giovana Magana Aguirre suspected f 60 2023-01-02 - 11 20 Mudrik al-Hallal suspected m 49 2023-01-02 - 12 22 Tea Slaughter probable f 7 2023-01-02 2023-01-02 - outcome date_outcome date_first_contact date_last_contact - 1 recovered - 2 recovered 2022-12-30 2023-01-05 - 3 recovered 2022-12-30 2023-01-02 - 4 recovered 2022-12-29 2023-01-02 - 5 recovered 2023-01-01 2023-01-03 - 6 recovered 2023-01-03 2023-01-04 - 7 recovered 2023-01-04 2023-01-05 - 8 recovered 2023-01-01 2023-01-04 - 9 died 2023-01-12 2022-12-31 2023-01-03 - 10 recovered 2022-12-30 2023-01-03 - 11 recovered 2023-01-01 2023-01-04 - 12 recovered 2023-01-01 2023-01-03 - # sim_linelist works as expected with anonymous Code diff --git a/tests/testthat/test-checkers.R b/tests/testthat/test-checkers.R index b5297835..af2c1809 100644 --- 
a/tests/testthat/test-checkers.R +++ b/tests/testthat/test-checkers.R @@ -187,7 +187,6 @@ test_that(".check_sim_input works as expected", { onset_to_death = onset_to_death, onset_to_recovery = onset_to_recovery, add_names = TRUE, - add_ct = FALSE, case_type_probs = c( suspected = 0.2, probable = 0.3, @@ -217,7 +216,6 @@ test_that(".check_sim_input works as expected", { onset_to_death = onset_to_death, onset_to_recovery = onset_to_recovery, add_names = TRUE, - add_ct = FALSE, case_type_probs = c( suspected = 0.2, probable = 0.3, @@ -261,7 +259,6 @@ test_that(".check_sim_input works as expected with NA risks", { onset_to_death = onset_to_death, onset_to_recovery = onset_to_recovery, add_names = TRUE, - add_ct = FALSE, case_type_probs = c( suspected = 0.2, probable = 0.3, diff --git a/tests/testthat/test-sim_linelist.R b/tests/testthat/test-sim_linelist.R index 717d94ed..d28e13e4 100644 --- a/tests/testthat/test-sim_linelist.R +++ b/tests/testthat/test-sim_linelist.R @@ -69,20 +69,6 @@ test_that("sim_linelist works as expected with age-strat risks", { ) }) -test_that("sim_linelist works as expected without Ct", { - set.seed(1) - expect_snapshot( - sim_linelist( - contact_distribution = contact_distribution, - infect_period = infect_period, - prob_infect = 0.5, - onset_to_hosp = onset_to_hosp, - onset_to_death = onset_to_death, - add_ct = FALSE - ) - ) -}) - test_that("sim_linelist works as expected with anonymous", { set.seed(1) expect_snapshot( @@ -246,7 +232,6 @@ test_that("sim_linelist fails as expected with modified config", { prob_infect = 0.5, onset_to_hosp = onset_to_hosp, onset_to_death = onset_to_death, - add_ct = TRUE, config = create_config( ct_distribution = "gamma" ) diff --git a/vignettes/design-principles.Rmd b/vignettes/design-principles.Rmd index f20fb54c..fcbd9be1 100644 --- a/vignettes/design-principles.Rmd +++ b/vignettes/design-principles.Rmd @@ -48,6 +48,8 @@ The simulation functions either return a `` or a `list` of `` returned by the functions 
(or in the case of `sim_outbreak()` a list of two `<data.frame>`s). Instead, we recommend modifying the line list or contact tracing data after the simulation, and provide a vignette to guide users on common data wrangling tasks in `wrangling-linelist.Rmd`. Not including arguments that can remove or add columns to the output `<data.frame>`s reduces the complexity of the functions; and by limiting the simulation function arguments to only parameterise, and not change the dimensionality of, the simulated data, the package is more robust to being used in pipelines or other automated approaches, where the data needs to be predictably formatted. + ## Dependencies The aim is to restrict the number of dependencies to a minimal required set for ease of maintenance. The current hard dependencies are: diff --git a/vignettes/wrangling-linelist.Rmd b/vignettes/wrangling-linelist.Rmd new file mode 100644 index 00000000..5e7d6e62 --- /dev/null +++ b/vignettes/wrangling-linelist.Rmd @@ -0,0 +1,123 @@ +--- +title: "Wrangling simulated outbreak data" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{wrangling-linelist} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +The {simulist} R package can generate line list data (`sim_linelist()`), contact tracing data (`sim_contacts()`), or both (`sim_outbreak()`). By default the line list produced by `sim_linelist()` and `sim_outbreak()` contains 12 columns. Some amount of post-simulation data wrangling may be needed to use the simulated epidemiological case data in certain applications. This vignette demonstrates some common data wrangling tasks that may be performed on simulated line list or contact tracing data.
+ ```{r setup} +library(simulist) +library(epiparameter) +library(dplyr) +``` + +This vignette provides data wrangling examples using both functions available in the R language (commonly called "base R") and [tidyverse R packages](https://www.tidyverse.org/), which are commonly applied to data science tasks in R. The tidyverse examples are shown by default, but select the "Base R" tab to see the equivalent functionality using base R. There are many other tools for wrangling data in R which are not covered by this vignette (e.g. [{data.table}](https://rdatatable.gitlab.io/data.table/)). + +::: {.alert .alert-info} +See these great resources for more information on general data wrangling in R: + +* [R for Data Science by Hadley Wickham, Mine Çetinkaya-Rundel, and Garrett Grolemund](https://r4ds.hadley.nz/) +* [{dplyr} R package](https://dplyr.tidyverse.org/) +* [{tidyr} R package](https://github.com/tidyverse/tidyr) +* [Wrangling data frames chapter in An Introduction to R by Alex Douglas, Deon Roos, Francesca Mancini, Ana Couto & David Lusseau](https://intro2r.com/wrangling-data-frames.html) +::: + +## Simulate an outbreak + +To simulate an outbreak we will use the `sim_outbreak()` function from the {simulist} R package. + +::: {.alert .alert-info} +If you are unfamiliar with the {simulist} package or the `sim_outbreak()` function, the [Get Started vignette](simulist.html) is a great place to start. +::: + +First we load in some data that is required for the outbreak simulation. Data on epidemiological parameters and distributions are read from the {epiparameter} R package.
+ ```{r read-epidist} +# create contact distribution (not available from {epiparameter} database) +contact_distribution <- epidist( + disease = "COVID-19", + epi_dist = "contact distribution", + prob_distribution = "pois", + prob_distribution_params = c(mean = 2) +) + +# create infectious period (not available from {epiparameter} database) +infect_period <- epidist( + disease = "COVID-19", + epi_dist = "infectious period", + prob_distribution = "gamma", + prob_distribution_params = c(shape = 1, scale = 1) +) + +# get onset to hospital admission from {epiparameter} database +onset_to_hosp <- epidist_db( + disease = "COVID-19", + epi_dist = "onset to hospitalisation", + single_epidist = TRUE +) + +# get onset to death from {epiparameter} database +onset_to_death <- epidist_db( + disease = "COVID-19", + epi_dist = "onset to death", + single_epidist = TRUE +) +``` + +The seed is set to ensure the output of the vignette is consistent. When using {simulist}, setting the seed is not required unless you need to simulate the same line list multiple times. + +```{r, set-seed} +set.seed(123) +``` + +```{r, sim-outbreak} +outbreak <- sim_outbreak( + contact_distribution = contact_distribution, + infect_period = infect_period, + prob_infect = 0.5, + onset_to_hosp = onset_to_hosp, + onset_to_death = onset_to_death +) +linelist <- outbreak$linelist +contacts <- outbreak$contacts +``` + +## Removing a line list column {.tabset} + +Not every column in the simulated line list may be required for the use case at hand. In this example we will remove the `$ct_value` column. This may be needed, for instance, if we wanted to simulate an outbreak for which no laboratory testing (e.g. Polymerase chain reaction, PCR, testing) was available and thus a Cycle threshold (Ct) value would not be known for confirmed cases.
+ +### Tidyverse + +```{r, rm-ct-col-tidyverse} +# remove column by name +linelist %>% + select(!ct_value) +``` + +### Base R + +```{r, rm-ct-col-base} +# remove column by numeric column indexing +# ct_value is column 12 (the last column) +linelist[, -12] + +# remove column by column name +linelist[, colnames(linelist) != "ct_value"] + +# remove column by assigning it to NULL +linelist$ct_value <- NULL +linelist +``` + +## {-}