From 91fc8e9c1f376415b6fe351b6ba988a8db3259c0 Mon Sep 17 00:00:00 2001 From: Joshua Lambert Date: Thu, 25 Apr 2024 17:01:00 +0100 Subject: [PATCH 01/10] remove add_ct argument from functions, WIP #41 --- R/checkers.R | 2 -- R/sim_internal.R | 15 ++++++--------- R/sim_linelist.R | 9 +-------- R/sim_outbreak.R | 3 --- man/dot-add_date.Rd | 2 +- man/dot-check_sim_input.Rd | 6 ------ man/dot-sim_internal.Rd | 8 +------- man/dot-sim_network_bp.Rd | 2 +- man/sim_contacts.Rd | 2 +- man/sim_linelist.Rd | 8 +------- man/sim_outbreak.Rd | 8 +------- 11 files changed, 13 insertions(+), 52 deletions(-) diff --git a/R/checkers.R b/R/checkers.R index 898a1e0d..a7e41eb8 100644 --- a/R/checkers.R +++ b/R/checkers.R @@ -123,7 +123,6 @@ onset_to_death = NULL, onset_to_recovery = NULL, add_names = NULL, - add_ct = NULL, case_type_probs = NULL, contact_tracing_status_probs = NULL, hosp_risk = NULL, @@ -149,7 +148,6 @@ .check_func_req_args(onset_to_death, func_name = "onset_to_death") .check_func_req_args(onset_to_recovery, func_name = "onset_to_recovery") checkmate::assert_logical(add_names, len = 1) - checkmate::assert_logical(add_ct, len = 1) checkmate::assert_numeric(case_type_probs, len = 3, lower = 0, upper = 1) checkmate::assert_names( names(case_type_probs), diff --git a/R/sim_internal.R b/R/sim_internal.R index 73f77d90..1f77ffd0 100644 --- a/R/sim_internal.R +++ b/R/sim_internal.R @@ -23,7 +23,6 @@ non_hosp_death_risk = NULL, outbreak_start_date, add_names = NULL, - add_ct = NULL, outbreak_size, population_age, case_type_probs = NULL, @@ -142,14 +141,12 @@ ) # add Ct if confirmed - if (add_ct) { - .data <- .add_ct( - .data = .data, - distribution = config$ct_distribution, - config$ct_distribution_params - ) - linelist_cols <- c(linelist_cols, "ct_value") - } + .data <- .add_ct( + .data = .data, + distribution = config$ct_distribution, + config$ct_distribution_params + ) + linelist_cols <- c(linelist_cols, "ct_value") } if (sim_type %in% c("contacts", "outbreak")) { diff --git 
a/R/sim_linelist.R b/R/sim_linelist.R index 4d244be2..d6ddb62e 100644 --- a/R/sim_linelist.R +++ b/R/sim_linelist.R @@ -67,10 +67,6 @@ #' @param outbreak_start_date A `date` for the start of the outbreak. #' @param add_names A `logical` boolean for whether to add names to each row #' of the line list. Default is `TRUE`. -#' @param add_ct A `logical` boolean for whether to add Ct values to each -#' confirmed case and `NA` otherwise for each case in the line list. -#' Default is `TRUE`. Ct refers to the Cycle threshold from a Real-time -#' PCR or quantitative PCR (qPCR). #' @param outbreak_size A `numeric` vector of length 2 defining the minimum and #' the maximum number of infected individuals for the simulated outbreak. #' Default is `c(10, 1e4)`, so the minimum outbreak size is 10 infected @@ -92,7 +88,7 @@ #' each case type. The names of the vector must be `"suspected"`, `"probable"`, #' `"confirmed"`. Values of each case type must sum to one. #' @param config A list of settings to adjust the randomly sampled delays and -#' Ct values (if `add_ct = TRUE`). See [create_config()] for more information. +#' Ct values. See [create_config()] for more information. 
#' #' @return A line list `` #' @export @@ -167,7 +163,6 @@ sim_linelist <- function(contact_distribution, non_hosp_death_risk = 0.05, outbreak_start_date = as.Date("2023-01-01"), add_names = TRUE, - add_ct = TRUE, outbreak_size = c(10, 1e4), population_age = c(1, 90), case_type_probs = c( @@ -203,7 +198,6 @@ sim_linelist <- function(contact_distribution, onset_to_death = onset_to_death, onset_to_recovery = onset_to_recovery, add_names = add_names, - add_ct = add_ct, case_type_probs = case_type_probs, hosp_risk = hosp_risk, hosp_death_risk = hosp_death_risk, @@ -261,7 +255,6 @@ sim_linelist <- function(contact_distribution, non_hosp_death_risk = non_hosp_death_risk, outbreak_start_date = outbreak_start_date, add_names = add_names, - add_ct = add_ct, outbreak_size = outbreak_size, population_age = population_age, case_type_probs = case_type_probs, diff --git a/R/sim_outbreak.R b/R/sim_outbreak.R index 3f4d169e..2b844730 100644 --- a/R/sim_outbreak.R +++ b/R/sim_outbreak.R @@ -64,7 +64,6 @@ sim_outbreak <- function(contact_distribution, non_hosp_death_risk = 0.05, outbreak_start_date = as.Date("2023-01-01"), add_names = TRUE, - add_ct = TRUE, outbreak_size = c(10, 1e4), population_age = c(1, 90), case_type_probs = c( @@ -105,7 +104,6 @@ sim_outbreak <- function(contact_distribution, onset_to_death = onset_to_death, onset_to_recovery = onset_to_recovery, add_names = add_names, - add_ct = add_ct, case_type_probs = case_type_probs, contact_tracing_status_probs = contact_tracing_status_probs, hosp_risk = hosp_risk, @@ -164,7 +162,6 @@ sim_outbreak <- function(contact_distribution, non_hosp_death_risk = non_hosp_death_risk, outbreak_start_date = outbreak_start_date, add_names = add_names, - add_ct = add_ct, outbreak_size = outbreak_size, population_age = population_age, case_type_probs = case_type_probs, diff --git a/man/dot-add_date.Rd b/man/dot-add_date.Rd index f0a87cf7..36f7816a 100644 --- a/man/dot-add_date.Rd +++ b/man/dot-add_date.Rd @@ -80,7 +80,7 @@ specified in 
the \code{config} the \code{non_hosp_death_risk} is interpreted as maximum risk across the epidemic.} \item{config}{A list of settings to adjust the randomly sampled delays and -Ct values (if \code{add_ct = TRUE}). See \code{\link[=create_config]{create_config()}} for more information.} +Ct values. See \code{\link[=create_config]{create_config()}} for more information.} } \value{ A \verb{} with one more column than input into \code{.data}. diff --git a/man/dot-check_sim_input.Rd b/man/dot-check_sim_input.Rd index 2b44e8da..c1550d0c 100644 --- a/man/dot-check_sim_input.Rd +++ b/man/dot-check_sim_input.Rd @@ -15,7 +15,6 @@ onset_to_death = NULL, onset_to_recovery = NULL, add_names = NULL, - add_ct = NULL, case_type_probs = NULL, contact_tracing_status_probs = NULL, hosp_risk = NULL, @@ -73,11 +72,6 @@ recover get an \code{NA} in the \verb{$date_outcome} line list column.} \item{add_names}{A \code{logical} boolean for whether to add names to each row of the line list. Default is \code{TRUE}.} -\item{add_ct}{A \code{logical} boolean for whether to add Ct values to each -confirmed case and \code{NA} otherwise for each case in the line list. -Default is \code{TRUE}. Ct refers to the Cycle threshold from a Real-time -PCR or quantitative PCR (qPCR).} - \item{case_type_probs}{A named \code{numeric} vector with the probability of each case type. The names of the vector must be \code{"suspected"}, \code{"probable"}, \code{"confirmed"}. Values of each case type must sum to one.} diff --git a/man/dot-sim_internal.Rd b/man/dot-sim_internal.Rd index 6095f279..f100cd59 100644 --- a/man/dot-sim_internal.Rd +++ b/man/dot-sim_internal.Rd @@ -18,7 +18,6 @@ within \pkg{simulist}} non_hosp_death_risk = NULL, outbreak_start_date, add_names = NULL, - add_ct = NULL, outbreak_size, population_age, case_type_probs = NULL, @@ -88,11 +87,6 @@ maximum risk across the epidemic.} \item{add_names}{A \code{logical} boolean for whether to add names to each row of the line list. 
Default is \code{TRUE}.} -\item{add_ct}{A \code{logical} boolean for whether to add Ct values to each -confirmed case and \code{NA} otherwise for each case in the line list. -Default is \code{TRUE}. Ct refers to the Cycle threshold from a Real-time -PCR or quantitative PCR (qPCR).} - \item{outbreak_size}{A \code{numeric} vector of length 2 defining the minimum and the maximum number of infected individuals for the simulated outbreak. Default is \code{c(10, 1e4)}, so the minimum outbreak size is 10 infected @@ -122,7 +116,7 @@ be \code{"under_followup"}, \code{"lost_to_followup"}, \code{"unknown"}. Values contact tracing status must sum to one.} \item{config}{A list of settings to adjust the randomly sampled delays and -Ct values (if \code{add_ct = TRUE}). See \code{\link[=create_config]{create_config()}} for more information.} +Ct values. See \code{\link[=create_config]{create_config()}} for more information.} } \value{ A \verb{} if \code{sim_type} is \code{"linelist"} or \code{"contacts"}, or a diff --git a/man/dot-sim_network_bp.Rd b/man/dot-sim_network_bp.Rd index b3fe498c..83197fc1 100644 --- a/man/dot-sim_network_bp.Rd +++ b/man/dot-sim_network_bp.Rd @@ -31,7 +31,7 @@ infectious period.} contact being infected by an infected primary contact.} \item{config}{A list of settings to adjust the randomly sampled delays and -Ct values (if \code{add_ct = TRUE}). See \code{\link[=create_config]{create_config()}} for more information.} +Ct values. See \code{\link[=create_config]{create_config()}} for more information.} } \value{ A \verb{} with the contact and transmission chain data. diff --git a/man/sim_contacts.Rd b/man/sim_contacts.Rd index 8c2c952a..63f555da 100644 --- a/man/sim_contacts.Rd +++ b/man/sim_contacts.Rd @@ -60,7 +60,7 @@ be \code{"under_followup"}, \code{"lost_to_followup"}, \code{"unknown"}. Values contact tracing status must sum to one.} \item{config}{A list of settings to adjust the randomly sampled delays and -Ct values (if \code{add_ct = TRUE}). 
See \code{\link[=create_config]{create_config()}} for more information.} +Ct values. See \code{\link[=create_config]{create_config()}} for more information.} } \value{ A contacts \verb{} diff --git a/man/sim_linelist.Rd b/man/sim_linelist.Rd index b5f0dba9..8acfd0d3 100644 --- a/man/sim_linelist.Rd +++ b/man/sim_linelist.Rd @@ -16,7 +16,6 @@ sim_linelist( non_hosp_death_risk = 0.05, outbreak_start_date = as.Date("2023-01-01"), add_names = TRUE, - add_ct = TRUE, outbreak_size = c(10, 10000), population_age = c(1, 90), case_type_probs = c(suspected = 0.2, probable = 0.3, confirmed = 0.5), @@ -82,11 +81,6 @@ maximum risk across the epidemic.} \item{add_names}{A \code{logical} boolean for whether to add names to each row of the line list. Default is \code{TRUE}.} -\item{add_ct}{A \code{logical} boolean for whether to add Ct values to each -confirmed case and \code{NA} otherwise for each case in the line list. -Default is \code{TRUE}. Ct refers to the Cycle threshold from a Real-time -PCR or quantitative PCR (qPCR).} - \item{outbreak_size}{A \code{numeric} vector of length 2 defining the minimum and the maximum number of infected individuals for the simulated outbreak. Default is \code{c(10, 1e4)}, so the minimum outbreak size is 10 infected @@ -111,7 +105,7 @@ each case type. The names of the vector must be \code{"suspected"}, \code{"proba \code{"confirmed"}. Values of each case type must sum to one.} \item{config}{A list of settings to adjust the randomly sampled delays and -Ct values (if \code{add_ct = TRUE}). See \code{\link[=create_config]{create_config()}} for more information.} +Ct values. 
See \code{\link[=create_config]{create_config()}} for more information.} } \value{ A line list \verb{} diff --git a/man/sim_outbreak.Rd b/man/sim_outbreak.Rd index 224a93d0..7774bffd 100644 --- a/man/sim_outbreak.Rd +++ b/man/sim_outbreak.Rd @@ -16,7 +16,6 @@ sim_outbreak( non_hosp_death_risk = 0.05, outbreak_start_date = as.Date("2023-01-01"), add_names = TRUE, - add_ct = TRUE, outbreak_size = c(10, 10000), population_age = c(1, 90), case_type_probs = c(suspected = 0.2, probable = 0.3, confirmed = 0.5), @@ -84,11 +83,6 @@ maximum risk across the epidemic.} \item{add_names}{A \code{logical} boolean for whether to add names to each row of the line list. Default is \code{TRUE}.} -\item{add_ct}{A \code{logical} boolean for whether to add Ct values to each -confirmed case and \code{NA} otherwise for each case in the line list. -Default is \code{TRUE}. Ct refers to the Cycle threshold from a Real-time -PCR or quantitative PCR (qPCR).} - \item{outbreak_size}{A \code{numeric} vector of length 2 defining the minimum and the maximum number of infected individuals for the simulated outbreak. Default is \code{c(10, 1e4)}, so the minimum outbreak size is 10 infected @@ -118,7 +112,7 @@ be \code{"under_followup"}, \code{"lost_to_followup"}, \code{"unknown"}. Values contact tracing status must sum to one.} \item{config}{A list of settings to adjust the randomly sampled delays and -Ct values (if \code{add_ct = TRUE}). See \code{\link[=create_config]{create_config()}} for more information.} +Ct values. 
See \code{\link[=create_config]{create_config()}} for more information.} } \value{ A list with two elements: From 07c971bb8eed4d26b7fe68bdcbbd508c28eab15b Mon Sep 17 00:00:00 2001 From: Joshua Lambert Date: Thu, 25 Apr 2024 17:01:32 +0100 Subject: [PATCH 02/10] remove add_ct argument from tests, WIP #41 --- tests/testthat/_snaps/sim_linelist.md | 34 --------------------------- tests/testthat/test-checkers.R | 3 --- tests/testthat/test-sim_linelist.R | 15 ------------ 3 files changed, 52 deletions(-) diff --git a/tests/testthat/_snaps/sim_linelist.md b/tests/testthat/_snaps/sim_linelist.md index 8411350e..fe7eda1b 100644 --- a/tests/testthat/_snaps/sim_linelist.md +++ b/tests/testthat/_snaps/sim_linelist.md @@ -66,40 +66,6 @@ 11 2023-01-01 2023-01-04 NA 12 2023-01-01 2023-01-03 NA -# sim_linelist works as expected without Ct - - Code - sim_linelist(contact_distribution = contact_distribution, infect_period = infect_period, - prob_infect = 0.5, onset_to_hosp = onset_to_hosp, onset_to_death = onset_to_death, - add_ct = FALSE) - Output - id case_name case_type sex age date_onset date_admission - 1 1 Dominic Sundara probable m 35 2023-01-01 - 2 2 Preston Montgomery suspected m 43 2023-01-01 - 3 3 Reece Chittum probable m 1 2023-01-01 - 4 5 Michael Cheek confirmed m 78 2023-01-01 - 5 6 Jennifer Smith confirmed f 22 2023-01-01 - 6 8 Erika Quintero confirmed f 28 2023-01-01 - 7 11 Isaiah Patterson suspected m 46 2023-01-01 2023-01-13 - 8 12 Cicely Anderson suspected f 67 2023-01-01 - 9 13 Michael John probable m 86 2023-01-01 2023-01-01 - 10 18 Giovana Magana Aguirre suspected f 60 2023-01-02 - 11 20 Mudrik al-Hallal suspected m 49 2023-01-02 - 12 22 Tea Slaughter probable f 7 2023-01-02 2023-01-02 - outcome date_outcome date_first_contact date_last_contact - 1 recovered - 2 recovered 2022-12-30 2023-01-05 - 3 recovered 2022-12-30 2023-01-02 - 4 recovered 2022-12-29 2023-01-02 - 5 recovered 2023-01-01 2023-01-03 - 6 recovered 2023-01-03 2023-01-04 - 7 recovered 2023-01-04 
2023-01-05 - 8 recovered 2023-01-01 2023-01-04 - 9 died 2023-01-12 2022-12-31 2023-01-03 - 10 recovered 2022-12-30 2023-01-03 - 11 recovered 2023-01-01 2023-01-04 - 12 recovered 2023-01-01 2023-01-03 - # sim_linelist works as expected with anonymous Code diff --git a/tests/testthat/test-checkers.R b/tests/testthat/test-checkers.R index b5297835..af2c1809 100644 --- a/tests/testthat/test-checkers.R +++ b/tests/testthat/test-checkers.R @@ -187,7 +187,6 @@ test_that(".check_sim_input works as expected", { onset_to_death = onset_to_death, onset_to_recovery = onset_to_recovery, add_names = TRUE, - add_ct = FALSE, case_type_probs = c( suspected = 0.2, probable = 0.3, @@ -217,7 +216,6 @@ test_that(".check_sim_input works as expected", { onset_to_death = onset_to_death, onset_to_recovery = onset_to_recovery, add_names = TRUE, - add_ct = FALSE, case_type_probs = c( suspected = 0.2, probable = 0.3, @@ -261,7 +259,6 @@ test_that(".check_sim_input works as expected with NA risks", { onset_to_death = onset_to_death, onset_to_recovery = onset_to_recovery, add_names = TRUE, - add_ct = FALSE, case_type_probs = c( suspected = 0.2, probable = 0.3, diff --git a/tests/testthat/test-sim_linelist.R b/tests/testthat/test-sim_linelist.R index 717d94ed..d28e13e4 100644 --- a/tests/testthat/test-sim_linelist.R +++ b/tests/testthat/test-sim_linelist.R @@ -69,20 +69,6 @@ test_that("sim_linelist works as expected with age-strat risks", { ) }) -test_that("sim_linelist works as expected without Ct", { - set.seed(1) - expect_snapshot( - sim_linelist( - contact_distribution = contact_distribution, - infect_period = infect_period, - prob_infect = 0.5, - onset_to_hosp = onset_to_hosp, - onset_to_death = onset_to_death, - add_ct = FALSE - ) - ) -}) - test_that("sim_linelist works as expected with anonymous", { set.seed(1) expect_snapshot( @@ -246,7 +232,6 @@ test_that("sim_linelist fails as expected with modified config", { prob_infect = 0.5, onset_to_hosp = onset_to_hosp, onset_to_death = 
onset_to_death, - add_ct = TRUE, config = create_config( ct_distribution = "gamma" ) From f473674db1e382a39a9502041e142ea4922c9f9e Mon Sep 17 00:00:00 2001 From: Joshua Lambert Date: Thu, 25 Apr 2024 17:09:22 +0100 Subject: [PATCH 03/10] added wrangling-linelist vignette --- vignettes/wrangling-linelist.Rmd | 128 +++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100644 vignettes/wrangling-linelist.Rmd diff --git a/vignettes/wrangling-linelist.Rmd b/vignettes/wrangling-linelist.Rmd new file mode 100644 index 00000000..c8c9006a --- /dev/null +++ b/vignettes/wrangling-linelist.Rmd @@ -0,0 +1,128 @@ +--- +title: "Wrangling simulated outbreak data" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Wrangling simulated outbreak data} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +The {simulist} R package can generate line list data (`sim_linelist()`), contact tracing data (`sim_contacts()`), or both (`sim_outbreak()`). By default the line list produced by `sim_linelist()` and `sim_outbreak()` contains 12 columns. Some amount of post-simulation data wrangling may be needed to use the simulated epidemiological case data in certain applications. This vignette demonstrates some common data wrangling tasks that may be performed on simulated line list or contact tracing data. + +```{r setup} +library(simulist) +library(epiparameter) +library(dplyr) +``` + +This vignette provides data wrangling examples using both functions available in the R language (commonly called "base R") as well as using [tidyverse R packages](https://www.tidyverse.org/), which are commonly applied to data science tasks in R. The tidyverse examples are shown by default, but select the "Base R" tab to see the equivalent functionality using base R. There are many other tools for wrangling data in R which are not covered by this vignette (e.g. 
[{data.table}](https://rdatatable.gitlab.io/data.table/)). + +::: {.alert .alert-info} +See these great resources for more information on general data wrangling in R: + +* [R for Data Science by Hadley Wickham, Mine Çetinkaya-Rundel, and Garrett Grolemund](https://r4ds.hadley.nz/) +* [{dplyr} R package](https://dplyr.tidyverse.org/) +* [{tidyr} R package](https://github.com/tidyverse/tidyr) +* [Wrangling data frames chapter in An Introduction to R by Alex Douglas, Deon Roos, Francesca Mancini, Ana Couto & David Lusseau](https://intro2r.com/wrangling-data-frames.html) +::: + +## Simulate an outbreak + +To simulate an outbreak we will use the `sim_outbreak()` function from the {simulist} R package. + +::: {.alert .alert-info} +If you are unfamiliar with the {simulist} package or the `sim_outbreak()` function, the [Get Started vignette](simulist.html) is a great place to start. +::: + +First we load in some data that is required for the outbreak simulation. Data on epidemiological parameters and distributions are read from the {epiparameter} R package. 
+ +```{r read-epidist} +# create contact distribution (not available from {epiparameter} database) +contact_distribution <- epidist( + disease = "COVID-19", + epi_dist = "contact distribution", + prob_distribution = "pois", + prob_distribution_params = c(mean = 2) +) + +# create infectious period (not available from {epiparameter} database) +infect_period <- epidist( + disease = "COVID-19", + epi_dist = "infectious period", + prob_distribution = "gamma", + prob_distribution_params = c(shape = 1, scale = 1) +) + +# get onset to hospital admission from {epiparameter} database +onset_to_hosp <- epidist_db( + disease = "COVID-19", + epi_dist = "onset to hospitalisation", + single_epidist = TRUE +) + +# get onset to death from {epiparameter} database +onset_to_death <- epidist_db( + disease = "COVID-19", + epi_dist = "onset to death", + single_epidist = TRUE +) +``` + +The seed is set to ensure the output of the vignette is consistent. When using {simulist}, setting the seed is not required unless you need to simulate the same line list multiple times. + +```{r, set-seed} +set.seed(123) +``` + +```{r, sim-outbreak} +outbreak <- sim_outbreak( + contact_distribution = contact_distribution, + infect_period = infect_period, + prob_infect = 0.5, + onset_to_hosp = onset_to_hosp, + onset_to_death = onset_to_death +) +linelist <- outbreak$linelist +contacts <- outbreak$contacts +``` + +## Removing a line list column {.tabset} + +Not every column in the simulated line list may be required for the use case at hand. In this example we will remove the `$ct_value` column. This is useful, for instance, if we want to simulate an outbreak for which no laboratory testing (e.g. Polymerase chain reaction, PCR, testing) was available, and thus a Cycle threshold (Ct) value would not be known for confirmed cases. 
+ +### Tidyverse + +```{r, rm-ct-col-tidyverse} +# remove column by name +linelist <- linelist %>% + select(!ct_value) +linelist +``` + +### Base R + +```{r, rm-ct-col-base, eval=FALSE} +# remove column by assigning it to NULL +linelist$ct_value <- NULL + +# remove column by numeric column indexing +# ct_value is column 12 (the last column) +linelist <- linelist[, 1:11] + +# remove column by column name +linelist <- linelist[, !colnames(linelist) %in% "ct_value"] +linelist +``` + +```{r, rm-ct-col-base2, eval=TRUE, echo=FALSE} +linelist +``` + +## {-} From 9bf2668821e54806128ba1a469eaea002ab465b8 Mon Sep 17 00:00:00 2001 From: Joshua Lambert Date: Thu, 25 Apr 2024 17:13:33 +0100 Subject: [PATCH 04/10] added bullet point to design-principles vignette on simulation argument to parameterise and not change dimensions of data --- vignettes/design-principles.Rmd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vignettes/design-principles.Rmd b/vignettes/design-principles.Rmd index f20fb54c..fcbd9be1 100644 --- a/vignettes/design-principles.Rmd +++ b/vignettes/design-principles.Rmd @@ -48,6 +48,8 @@ The simulation functions either return a `` or a `list` of `` returned by the functions (or in the case of `sim_outbreak()` a list of two ``s). Instead, we recommend modifying the line list or contact tracing data after the simulation, and provide a vignette to guide users on common data wrangling tasks in `wrangling-linelist.Rmd`. Not including arguments that can remove or add columns to the output ``s reduces the complexity of the functions; and by limiting the simulation function arguments to only parameterise, and not change the dimensionality of, the simulated data, the package is more robust to being used in pipelines or other automated approaches, where the data needs to be predictably formatted. + ## Dependencies The aim is to restrict the number of dependencies to a minimal required set for ease of maintenance. 
The current hard dependencies are: From 1cc27b53a47dfc02f5c2a7bdba3f8eccd0a5cb55 Mon Sep 17 00:00:00 2001 From: Joshua Lambert Date: Thu, 25 Apr 2024 17:13:58 +0100 Subject: [PATCH 05/10] updated _pkgdown.yml articles --- _pkgdown.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/_pkgdown.yml b/_pkgdown.yml index e30026ef..d9f0d266 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -18,12 +18,16 @@ reference: - create_config articles: -- title: Package Vignettes - navbar: Package Vignettes +- title: Customising simulated outbreak + navbar: Customising simulated outbreak contents: - age-strat-risks - age-struct-pop - time-varying-cfr +- title: Wrangling and plotting data + navbar: Wrangling and plotting data + contents: + - wrangling-linelist - vis-linelist - title: Developer Documentation navbar: Developer Documentation From f293a56fb3d4386a23e7fca2241d656af712c19a Mon Sep 17 00:00:00 2001 From: Joshua Lambert Date: Thu, 25 Apr 2024 17:14:14 +0100 Subject: [PATCH 06/10] linted wrangling-linelist vignette --- vignettes/wrangling-linelist.Rmd | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vignettes/wrangling-linelist.Rmd b/vignettes/wrangling-linelist.Rmd index c8c9006a..27044684 100644 --- a/vignettes/wrangling-linelist.Rmd +++ b/vignettes/wrangling-linelist.Rmd @@ -83,10 +83,10 @@ set.seed(123) ```{r, sim-outbreak} outbreak <- sim_outbreak( - contact_distribution = contact_distribution, - infect_period = infect_period, - prob_infect = 0.5, - onset_to_hosp = onset_to_hosp, + contact_distribution = contact_distribution, + infect_period = infect_period, + prob_infect = 0.5, + onset_to_hosp = onset_to_hosp, onset_to_death = onset_to_death ) linelist <- outbreak$linelist @@ -117,7 +117,7 @@ linelist$ct_value <- NULL linelist <- linelist[, 1:11] # remove column by column name -linelist <- linelist[, !colnames(linelist) %in% "ct_value"] +linelist <- linelist[, !colnames(linelist) == "ct_value"] linelist ``` From 
dd2a42e1146dd36f792f0b6f33fd48e5a5eb7699 Mon Sep 17 00:00:00 2001 From: Joshua Lambert Date: Thu, 25 Apr 2024 17:14:27 +0100 Subject: [PATCH 07/10] updated WORDLIST --- inst/WORDLIST | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/inst/WORDLIST b/inst/WORDLIST index 32bcd2d2..2615f614 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -2,6 +2,7 @@ aes apyramid bookdown bw +Çetinkaya cfr CMD codecov @@ -13,6 +14,7 @@ COVID Ct ct db +Deon df dist dplyr @@ -27,6 +29,7 @@ facetted ggplot gh github +Grolemund implmented infector integerish @@ -36,6 +39,8 @@ lifecycle Lifecycle linelist lintr +Lusseau +Mancini md MERS olds @@ -52,8 +57,9 @@ qPCR randomNames RECON redocumented -rmarkdown resimulate +rmarkdown +Rundel SARS sensu sim @@ -65,6 +71,7 @@ tabset testthat threejs tidyr +tidyverse Tidyverse visNetwork yaml From d6abbbe6e3c01cfc3807ba7dbd04819581d740a6 Mon Sep 17 00:00:00 2001 From: Joshua Lambert Date: Mon, 29 Apr 2024 12:30:29 +0100 Subject: [PATCH 08/10] Use negation for base R col subsetting Co-authored-by: Hugo Gruson <10783929+Bisaloo@users.noreply.github.com> --- vignettes/wrangling-linelist.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/wrangling-linelist.Rmd b/vignettes/wrangling-linelist.Rmd index 27044684..b7b60140 100644 --- a/vignettes/wrangling-linelist.Rmd +++ b/vignettes/wrangling-linelist.Rmd @@ -114,7 +114,7 @@ linelist$ct_value <- NULL # remove column by numeric column indexing # ct_value is column 12 (the last column) -linelist <- linelist[, 1:11] +linelist <- linelist[, -12] # remove column by column name linelist <- linelist[, !colnames(linelist) == "ct_value"] From ae9ea9c3f2ba63fa62c6df9591eeb235858c498a Mon Sep 17 00:00:00 2001 From: Joshua Lambert Date: Mon, 29 Apr 2024 12:31:05 +0100 Subject: [PATCH 09/10] Use != in base R subsetting Co-authored-by: Hugo Gruson <10783929+Bisaloo@users.noreply.github.com> --- vignettes/wrangling-linelist.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/vignettes/wrangling-linelist.Rmd b/vignettes/wrangling-linelist.Rmd index b7b60140..5c139981 100644 --- a/vignettes/wrangling-linelist.Rmd +++ b/vignettes/wrangling-linelist.Rmd @@ -117,7 +117,7 @@ linelist$ct_value <- NULL linelist <- linelist[, -12] # remove column by column name -linelist <- linelist[, !colnames(linelist) == "ct_value"] +linelist <- linelist[, colnames(linelist) != "ct_value"] linelist ``` From d7030763de7add28693ccee7aaaebd54fca767fb Mon Sep 17 00:00:00 2001 From: Joshua Lambert Date: Mon, 29 Apr 2024 13:15:50 +0100 Subject: [PATCH 10/10] removed eval=FALSE code from wrangling-linelist vignette Co-authored-by: Hugo Gruson <10783929+Bisaloo@users.noreply.github.com> --- vignettes/wrangling-linelist.Rmd | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/vignettes/wrangling-linelist.Rmd b/vignettes/wrangling-linelist.Rmd index 5c139981..5e7d6e62 100644 --- a/vignettes/wrangling-linelist.Rmd +++ b/vignettes/wrangling-linelist.Rmd @@ -101,27 +101,22 @@ Not every column in the simulated line list may be required for the use case at ```{r, rm-ct-col-tidyverse} # remove column by name -linelist <- linelist %>% +linelist %>% select(!ct_value) -linelist ``` ### Base R -```{r, rm-ct-col-base, eval=FALSE} -# remove column by assigning it to NULL -linelist$ct_value <- NULL - +```{r, rm-ct-col-base} # remove column by numeric column indexing # ct_value is column 12 (the last column) -linelist <- linelist[, -12] +linelist[, -12] # remove column by column name -linelist <- linelist[, colnames(linelist) != "ct_value"] -linelist -``` +linelist[, colnames(linelist) != "ct_value"] -```{r, rm-ct-col-base2, eval=TRUE, echo=FALSE} +# remove column by assigning it to NULL +linelist$ct_value <- NULL linelist ```