Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add anonymous line lists #106

Merged
merged 10 commits into from
May 3, 2024
6 changes: 5 additions & 1 deletion R/add_cols.R
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,7 @@ NULL
#' is `pois`).
#' @param ... [dots] Extra arguments to be passed to the distribution function
#' given in the `distribution` argument.
#' @inheritParams sim_linelist
#'
#' @name .add_info
#'
Expand All @@ -225,8 +226,11 @@ NULL
NULL

#' @name .add_info
.add_names <- function(.data) {
.add_names <- function(.data, anonymise = FALSE) {
.data$case_name <- .sample_names(.data = .data)
if (anonymise) {
.data$case_name <- .anonymise(.data$case_name)
}

# left join corresponding names to infectors preserving column and row order
infector_names <- data.frame(id = .data$id, infector_name = .data$case_name)
Expand Down
4 changes: 2 additions & 2 deletions R/checkers.R
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@
onset_to_hosp = NULL,
onset_to_death = NULL,
onset_to_recovery = NULL,
add_names = NULL,
anonymise = NULL,
case_type_probs = NULL,
contact_tracing_status_probs = NULL,
hosp_risk = NULL,
Expand All @@ -147,7 +147,7 @@
.check_func_req_args(onset_to_hosp, func_name = "onset_to_hosp")
.check_func_req_args(onset_to_death, func_name = "onset_to_death")
.check_func_req_args(onset_to_recovery, func_name = "onset_to_recovery")
checkmate::assert_logical(add_names, len = 1)
checkmate::assert_logical(anonymise, len = 1)
checkmate::assert_numeric(case_type_probs, len = 3, lower = 0, upper = 1)
checkmate::assert_names(
names(case_type_probs),
Expand Down
2 changes: 2 additions & 0 deletions R/sim_contacts.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ sim_contacts <- function(contact_distribution,
infect_period,
prob_infect,
outbreak_start_date = as.Date("2023-01-01"),
anonymise = FALSE,
outbreak_size = c(10, 1e4),
population_age = c(1, 90),
contact_tracing_status_probs = c(
Expand Down Expand Up @@ -78,6 +79,7 @@ sim_contacts <- function(contact_distribution,
infect_period = infect_period,
prob_infect = prob_infect,
outbreak_start_date = outbreak_start_date,
anonymise = anonymise,
outbreak_size = outbreak_size,
population_age = population_age,
contact_tracing_status_probs = contact_tracing_status_probs,
Expand Down
10 changes: 4 additions & 6 deletions R/sim_internal.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
hosp_death_risk = NULL,
non_hosp_death_risk = NULL,
outbreak_start_date,
add_names = NULL,
anonymise = NULL,
outbreak_size,
population_age,
case_type_probs = NULL,
Expand Down Expand Up @@ -127,10 +127,8 @@
"outcome", "date_outcome", "date_first_contact", "date_last_contact"
)

if (add_names) {
.data <- .add_names(.data = .data)
linelist_cols <- append(linelist_cols, "case_name", after = 1)
}
.data <- .add_names(.data = .data, anonymise = anonymise)
linelist_cols <- append(linelist_cols, "case_name", after = 1)

# add confirmed, probable, suspected case types
.data$case_type[.data$infected == "infected"] <- sample(
Expand All @@ -151,7 +149,7 @@

if (sim_type %in% c("contacts", "outbreak")) {
if (!"infector_name" %in% colnames(.data)) {
.data <- .add_names(.data = .data)
.data <- .add_names(.data = .data, anonymise = anonymise)
}

contacts_tbl <- subset(
Expand Down
10 changes: 5 additions & 5 deletions R/sim_linelist.R
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@
#' specified in the `config` the `non_hosp_death_risk` is interpreted as the
#' maximum risk across the epidemic.
#' @param outbreak_start_date A `date` for the start of the outbreak.
#' @param add_names A `logical` boolean for whether to add names to each row
#' of the line list. Default is `TRUE`.
#' @param anonymise A `logical` boolean for whether case names should be
#' anonymised. Default is `FALSE`.
#' @param outbreak_size A `numeric` vector of length 2 defining the minimum and
#' the maximum number of infected individuals for the simulated outbreak.
#' Default is `c(10, 1e4)`, so the minimum outbreak size is 10 infected
Expand Down Expand Up @@ -162,7 +162,7 @@ sim_linelist <- function(contact_distribution,
hosp_death_risk = 0.5,
non_hosp_death_risk = 0.05,
outbreak_start_date = as.Date("2023-01-01"),
add_names = TRUE,
anonymise = FALSE,
outbreak_size = c(10, 1e4),
population_age = c(1, 90),
case_type_probs = c(
Expand Down Expand Up @@ -197,7 +197,7 @@ sim_linelist <- function(contact_distribution,
onset_to_hosp = onset_to_hosp,
onset_to_death = onset_to_death,
onset_to_recovery = onset_to_recovery,
add_names = add_names,
anonymise = anonymise,
case_type_probs = case_type_probs,
hosp_risk = hosp_risk,
hosp_death_risk = hosp_death_risk,
Expand Down Expand Up @@ -254,7 +254,7 @@ sim_linelist <- function(contact_distribution,
hosp_death_risk = hosp_death_risk,
non_hosp_death_risk = non_hosp_death_risk,
outbreak_start_date = outbreak_start_date,
add_names = add_names,
anonymise = anonymise,
outbreak_size = outbreak_size,
population_age = population_age,
case_type_probs = case_type_probs,
Expand Down
6 changes: 3 additions & 3 deletions R/sim_outbreak.R
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ sim_outbreak <- function(contact_distribution,
hosp_death_risk = 0.5,
non_hosp_death_risk = 0.05,
outbreak_start_date = as.Date("2023-01-01"),
add_names = TRUE,
anonymise = FALSE,
outbreak_size = c(10, 1e4),
population_age = c(1, 90),
case_type_probs = c(
Expand Down Expand Up @@ -103,7 +103,7 @@ sim_outbreak <- function(contact_distribution,
onset_to_hosp = onset_to_hosp,
onset_to_death = onset_to_death,
onset_to_recovery = onset_to_recovery,
add_names = add_names,
anonymise = anonymise,
case_type_probs = case_type_probs,
contact_tracing_status_probs = contact_tracing_status_probs,
hosp_risk = hosp_risk,
Expand Down Expand Up @@ -161,7 +161,7 @@ sim_outbreak <- function(contact_distribution,
hosp_death_risk = hosp_death_risk,
non_hosp_death_risk = non_hosp_death_risk,
outbreak_start_date = outbreak_start_date,
add_names = add_names,
anonymise = anonymise,
outbreak_size = outbreak_size,
population_age = population_age,
case_type_probs = case_type_probs,
Expand Down
52 changes: 52 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,58 @@
names_mf
}

#' Anonymise names
#'
#' @description
#' A simple algorithm to replace names with an alphanumeric string with a
#' fixed number of characters (i.e. [nchar()]) specified by `string_len`.
#' Repeated names are replaced by the same anonymised string, distinct names
#' are replaced by distinct anonymised strings, and `NA`s are kept as `NA`.
#'
#' @param x A vector of `character` strings.
#' @param string_len A single `numeric` specifying the number of alphanumeric
#' characters to use for each anonymising `character` string.
#' Default is `10`.
#'
#' @return A vector of `character` strings of equal length to the input.
#' An error is thrown if `string_len` is too small to give every unique name
#' in `x` its own anonymised string.
#' @keywords internal
.anonymise <- function(x, string_len = 10) {
  stopifnot(
    "`string_len` must be a single non-negative number" =
      is.numeric(string_len) && length(string_len) == 1L &&
      !is.na(string_len) && string_len >= 0
  )

  # unique non-missing values so repeated strings get the same anon string
  uniq_x <- unique(x[!is.na(x)])

  # characters to sample from
  chars <- c(letters, LETTERS, 1:9)

  # without this guard the uniqueness loop below could never terminate
  max_strings <- length(chars)^floor(string_len)
  if (length(uniq_x) > max_strings) {
    stop(
      "Cannot create ", length(uniq_x), " unique anonymised strings with ",
      "`string_len = ", string_len, "`; increase `string_len`.",
      call. = FALSE
    )
  }

  # draw `n` random strings, one call to sample() per string
  draw_strings <- function(n) {
    vapply(
      seq_len(n),
      function(i) {
        paste(
          sample(x = chars, size = string_len, replace = TRUE),
          collapse = ""
        )
      },
      FUN.VALUE = character(1)
    )
  }

  anon <- draw_strings(length(uniq_x))

  # redraw only the colliding strings until all anon strings are unique
  is_dup <- duplicated(anon)
  while (any(is_dup)) {
    anon[is_dup] <- draw_strings(sum(is_dup))
    is_dup <- duplicated(anon)
  }

  # look up each input value's anon string; NAs in `x` have no match in
  # `uniq_x` so they index as NA and stay NA (as character) in the output
  anon[match(x, uniq_x)]
}

#' Check if \R object is a single `NA`
#'
#' Check if an \R object is specifically a single logical [`NA`] (i.e.
Expand Down
1 change: 1 addition & 0 deletions inst/WORDLIST
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ packagename
params
parameterise
parameterised
PII
pkgdown
Poisson
Pratik
Expand Down
5 changes: 4 additions & 1 deletion man/dot-add_info.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

23 changes: 23 additions & 0 deletions man/dot-anonymise.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions man/dot-check_sim_input.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions man/dot-sim_internal.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions man/sim_contacts.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions man/sim_linelist.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions man/sim_outbreak.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading