From 8f9d7b7fd7de3ca47f6c336332214319a02f2922 Mon Sep 17 00:00:00 2001 From: "Simon P. Couch" Date: Fri, 24 Jul 2020 14:17:16 -0700 Subject: [PATCH 01/13] possible rep_sample_n() re-implementation (#279) also adds a new rep_slice_sample wrapper that has a more similar interface to dplyr::slice_sample(). still need to extend unit testing and rewrite examples. --- NAMESPACE | 2 +- R/rep_sample_n.R | 54 +++++++++++------------------- man/get_confidence_interval.Rd | 2 +- man/rep_sample_n.Rd | 10 +++++- tests/testthat/test-rep_sample_n.R | 15 +++++---- 5 files changed, 39 insertions(+), 44 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index ebcfda1b..b66dea2f 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,6 +17,7 @@ export(hypothesize) export(p_value) export(prop_test) export(rep_sample_n) +export(rep_slice_sample) export(shade_ci) export(shade_confidence_interval) export(shade_p_value) @@ -28,7 +29,6 @@ export(visualise) export(visualize) importFrom(dplyr,bind_rows) importFrom(dplyr,group_by) -importFrom(dplyr,inner_join) importFrom(dplyr,mutate_if) importFrom(dplyr,n) importFrom(dplyr,one_of) diff --git a/R/rep_sample_n.R b/R/rep_sample_n.R index 19e882c6..e0f6557b 100644 --- a/R/rep_sample_n.R +++ b/R/rep_sample_n.R @@ -14,6 +14,11 @@ #' #' @return A tibble of size `rep` times `size` rows corresponding to `rep` #' samples of size n = `size` from `tbl`. +#' +#' @details The [dplyr::sample_n()] function from that `rep_sample_n()` function +#' was originally written to supplement has been superseded +#' by [dplyr::slice_sample()]. `rep_slice_sample()` provides a light wrapper +#' around `rep_sample_n()` that has a more similar interface to `slice_sample()` #' #' @examples #' suppressPackageStartupMessages(library(dplyr)) @@ -39,14 +44,8 @@ #' geom_density() + #' labs(x = "p_hat", y = "Number of samples", #' title = "Sampling distribution of p_hat from 1000 samples of size 50") -#' -#' @importFrom dplyr pull -#' @importFrom dplyr inner_join -#' @importFrom dplyr group_by #' @export rep_sample_n <- function(tbl, size, replace = FALSE, reps = 1, prob = NULL) { - n <- nrow(tbl) - check_type(tbl, is.data.frame) check_type(size, is.numeric) check_type(replace, is.logical) @@ -55,33 +54,20 @@ rep_sample_n <- function(tbl, size, replace = FALSE, reps = 1, prob = NULL) { check_type(prob, is.numeric) } - # assign non-uniform probabilities - # there should be a better way!! - # prob needs to be nrow(tbl) -- not just number of factor levels - if (!is.null(prob)) { - if (length(prob) != n) { - stop_glue( - "The argument `prob` must have length `nrow(tbl)` = {nrow(tbl)}" - ) - } - - prob <- tibble::tibble(vals = levels(dplyr::pull(tbl, 1))) %>% - dplyr::mutate(probs = prob) %>% - dplyr::inner_join(tbl) %>% - dplyr::select(probs) %>% - dplyr::pull() - } + 1:reps %>% + purrr::map_dfr( + ~ tbl %>% + dplyr::slice_sample(n = size, weight_by = prob, replace = replace) + ) %>% + dplyr::mutate( + replicate = rep(1:reps, each = size), + .before = dplyr::everything() + ) %>% + dplyr::group_by(replicate) +} - i <- unlist(replicate( - reps, - sample.int(n, size, replace = replace, prob = prob), - simplify = FALSE - )) - rep_tbl <- cbind( - replicate = rep(1:reps, rep(size, reps)), - tbl[i, ] - ) - rep_tbl <- tibble::as_tibble(rep_tbl) - names(rep_tbl)[-1] <- names(tbl) - dplyr::group_by(rep_tbl, replicate) +#' @rdname rep_sample_n +#' @export +rep_slice_sample <- function(tbl, n, replace = FALSE, reps = 1, weight_by = NULL) { + rep_sample_n(tbl, n, replace, reps, weight_by) } diff --git a/man/get_confidence_interval.Rd b/man/get_confidence_interval.Rd index f534b35f..284b30a9 100644 --- a/man/get_confidence_interval.Rd +++ b/man/get_confidence_interval.Rd @@ -36,7 +36,7 @@ correspond to lower and upper bounds of the confidence interval. } \description{ Compute a confidence interval around a summary statistic. Only -simulation-based methods are (currently only) supported. +simulation-based methods are (currently) supported. Learn more in \code{vignette("infer")}. } diff --git a/man/rep_sample_n.Rd b/man/rep_sample_n.Rd index 5607d1df..99c96c6f 100644 --- a/man/rep_sample_n.Rd +++ b/man/rep_sample_n.Rd @@ -2,9 +2,12 @@ % Please edit documentation in R/rep_sample_n.R \name{rep_sample_n} \alias{rep_sample_n} +\alias{rep_slice_sample} \title{Perform repeated sampling} \usage{ rep_sample_n(tbl, size, replace = FALSE, reps = 1, prob = NULL) + +rep_slice_sample(tbl, n, replace = FALSE, reps = 1, weight_by = NULL) } \arguments{ \item{tbl}{Data frame of population from which to sample.} @@ -26,6 +29,12 @@ samples of size n = \code{size} from \code{tbl}. Perform repeated sampling of samples of size n. Useful for creating sampling distributions. } +\details{ +The \code{\link[dplyr:sample_n]{dplyr::sample_n()}} function from that \code{rep_sample_n()} function +was originally written to supplement has been superseded +by \code{\link[dplyr:slice]{dplyr::slice_sample()}}. \code{rep_slice_sample()} provides a light wrapper +around \code{rep_sample_n()} that has a more similar interface to \code{slice_sample()} +} \examples{ suppressPackageStartupMessages(library(dplyr)) suppressPackageStartupMessages(library(ggplot2)) @@ -50,5 +59,4 @@ ggplot(p_hats, aes(x = prop_hurricane)) + geom_density() + labs(x = "p_hat", y = "Number of samples", title = "Sampling distribution of p_hat from 1000 samples of size 50") - } diff --git a/tests/testthat/test-rep_sample_n.R b/tests/testthat/test-rep_sample_n.R index 9bdbc4b0..0cf860fb 100644 --- a/tests/testthat/test-rep_sample_n.R +++ b/tests/testthat/test-rep_sample_n.R @@ -2,25 +2,26 @@ context("rep_sample_n") N <- 5 population <- tibble::tibble( - ball_ID = 1:N, + ball_id = 1:N, color = factor(c(rep("red", 3), rep("white", N - 3))) ) test_that("rep_sample_n works", { - expect_silent(population %>% rep_sample_n(size = 2, reps = 10)) + expect_silent(test_rep <- population %>% rep_sample_n(size = 2, reps = 10)) + expect_error( population %>% rep_sample_n(size = 2, reps = 10, prob = rep(x = 1/5, times = 100)) ) + expect_error( population %>% rep_sample_n(size = 2, reps = 10, prob = c(1/2, 1/2)) ) - expect_error( - population %>% - rep_sample_n(size = 2, reps = 10, prob = c(0.25, 1/5, 1/5, 1/5, 0.15)) - ) - test_rep <- population %>% rep_sample_n(size = 2, reps = 10) expect_equal(c("replicate", names(population)), names(test_rep)) }) + +test_that("rep_slice_sample works", { + +}) From 5882a2be8c4d667ec2dfde9666712025582ef1a1 Mon Sep 17 00:00:00 2001 From: "Simon P. Couch" Date: Fri, 24 Jul 2020 15:02:05 -0700 Subject: [PATCH 02/13] add unit testing, adjust docs --- R/rep_sample_n.R | 50 ++++++++++--------- man/rep_sample_n.Rd | 50 ++++++++++--------- tests/testthat/test-rep_sample_n.R | 79 ++++++++++++++++++++++++++++-- 3 files changed, 129 insertions(+), 50 deletions(-) diff --git a/R/rep_sample_n.R b/R/rep_sample_n.R index e0f6557b..4edffd78 100644 --- a/R/rep_sample_n.R +++ b/R/rep_sample_n.R @@ -2,48 +2,50 @@ #' #' @description #' -#' Perform repeated sampling of samples of size n. Useful for creating sampling -#' distributions. +#' These functions extend the functionality of [dplyr::sample_n()] and +#' [dplyr::slice_sample()] by allowing for repeated sampling of data. +#' This operation is especially helpful while creating sampling +#' distributions—see the examples below! #' #' @param tbl Data frame of population from which to sample. #' @param size Sample size of each sample. #' @param replace Should sampling be with replacement? #' @param reps Number of samples of size n = `size` to take. -#' @param prob A vector of probability weights for obtaining the elements of the -#' vector being sampled. +#' @param prob A vector of sampling weights for each of the rows in `tbl`—must +#' have length equal to `nrow(tbl)`. +#' @param n Sample size of each sample. +#' @param weight_by A vector of sampling weights for each of the rows +#' in `tbl`—must have length equal to `nrow(tbl)`. #' -#' @return A tibble of size `rep` times `size` rows corresponding to `rep` -#' samples of size n = `size` from `tbl`. +#' @return A tibble of size `rep * size` rows corresponding to `reps` +#' samples of size `size` from `tbl`, grouped by `replicate`. #' #' @details The [dplyr::sample_n()] function from that `rep_sample_n()` function #' was originally written to supplement has been superseded #' by [dplyr::slice_sample()]. `rep_slice_sample()` provides a light wrapper -#' around `rep_sample_n()` that has a more similar interface to `slice_sample()` +#' around `rep_sample_n()` that has a more similar interface to `slice_sample()`. #' #' @examples -#' suppressPackageStartupMessages(library(dplyr)) -#' suppressPackageStartupMessages(library(ggplot2)) -#' -#' # A virtual population of N = 10,010, of which 3091 are hurricanes -#' population <- dplyr::storms %>% -#' select(status) -#' -#' # Take samples of size n = 50 storms without replacement; do this 1000 times -#' samples <- population %>% +#' library(dplyr) +#' library(ggplot2) +#' +#' # take 1000 samples of size n = 50, without replacement +#' resamples <- gss %>% #' rep_sample_n(size = 50, reps = 1000) -#' samples +#' +#' resamples #' -#' # Compute p_hats for all 1000 samples = proportion hurricanes -#' p_hats <- samples %>% +#' # compute the proportion of respondents with a college +#' # degree in each replicate +#' p_hats <- resamples %>% #' group_by(replicate) %>% -#' summarize(prop_hurricane = mean(status == "hurricane")) -#' p_hats +#' summarize(prop_college = mean(college == "degree")) #' -#' # Plot sampling distribution -#' ggplot(p_hats, aes(x = prop_hurricane)) + +#' # plot sampling distribution +#' ggplot(p_hats, aes(x = prop_college)) + #' geom_density() + #' labs(x = "p_hat", y = "Number of samples", -#' title = "Sampling distribution of p_hat from 1000 samples of size 50") +#' title = "Sampling distribution of p_hat") #' @export rep_sample_n <- function(tbl, size, replace = FALSE, reps = 1, prob = NULL) { check_type(tbl, is.data.frame) diff --git a/man/rep_sample_n.Rd b/man/rep_sample_n.Rd index 99c96c6f..460493f6 100644 --- a/man/rep_sample_n.Rd +++ b/man/rep_sample_n.Rd @@ -18,45 +18,49 @@ rep_slice_sample(tbl, n, replace = FALSE, reps = 1, weight_by = NULL) \item{reps}{Number of samples of size n = \code{size} to take.} -\item{prob}{A vector of probability weights for obtaining the elements of the -vector being sampled.} +\item{prob}{A vector of sampling weights for each of the rows in \code{tbl}—must +have length equal to \code{nrow(tbl)}.} + +\item{n}{Sample size of each sample.} + +\item{weight_by}{A vector of sampling weights for each of the rows +in \code{tbl}—must have length equal to \code{nrow(tbl)}.} } \value{ -A tibble of size \code{rep} times \code{size} rows corresponding to \code{rep} -samples of size n = \code{size} from \code{tbl}. +A tibble of size \code{rep * size} rows corresponding to \code{reps} +samples of size \code{size} from \code{tbl}, grouped by \code{replicate}. } \description{ -Perform repeated sampling of samples of size n. Useful for creating sampling -distributions. +These functions extend the functionality of \code{\link[dplyr:sample_n]{dplyr::sample_n()}} and +\code{\link[dplyr:slice]{dplyr::slice_sample()}} by allowing for repeated sampling of data. +This operation is especially helpful while creating sampling +distributions—see the examples below! } \details{ The \code{\link[dplyr:sample_n]{dplyr::sample_n()}} function from that \code{rep_sample_n()} function was originally written to supplement has been superseded by \code{\link[dplyr:slice]{dplyr::slice_sample()}}. \code{rep_slice_sample()} provides a light wrapper -around \code{rep_sample_n()} that has a more similar interface to \code{slice_sample()} +around \code{rep_sample_n()} that has a more similar interface to \code{slice_sample()}. } \examples{ -suppressPackageStartupMessages(library(dplyr)) -suppressPackageStartupMessages(library(ggplot2)) - -# A virtual population of N = 10,010, of which 3091 are hurricanes -population <- dplyr::storms \%>\% - select(status) +library(dplyr) +library(ggplot2) -# Take samples of size n = 50 storms without replacement; do this 1000 times -samples <- population \%>\% +# take 1000 samples of size n = 50, without replacement +resamples <- gss \%>\% rep_sample_n(size = 50, reps = 1000) -samples + +resamples -# Compute p_hats for all 1000 samples = proportion hurricanes -p_hats <- samples \%>\% +# compute the proportion of respondents with a college +# degree in each replicate +p_hats <- resamples \%>\% group_by(replicate) \%>\% - summarize(prop_hurricane = mean(status == "hurricane")) -p_hats + summarize(prop_college = mean(college == "degree")) -# Plot sampling distribution -ggplot(p_hats, aes(x = prop_hurricane)) + +# plot sampling distribution +ggplot(p_hats, aes(x = prop_college)) + geom_density() + labs(x = "p_hat", y = "Number of samples", - title = "Sampling distribution of p_hat from 1000 samples of size 50") + title = "Sampling distribution of p_hat") } diff --git a/tests/testthat/test-rep_sample_n.R b/tests/testthat/test-rep_sample_n.R index 0cf860fb..fb02d145 100644 --- a/tests/testthat/test-rep_sample_n.R +++ b/tests/testthat/test-rep_sample_n.R @@ -6,9 +6,68 @@ population <- tibble::tibble( color = factor(c(rep("red", 3), rep("white", N - 3))) ) -test_that("rep_sample_n works", { - expect_silent(test_rep <- population %>% rep_sample_n(size = 2, reps = 10)) +test_that("rep_sample_n is sensitive to the size argument", { + set.seed(1) + reps <- 10 + s1 <- 2 + s2 <- 3 + res1 <- population %>% rep_sample_n(size = s1, reps = reps) + res2 <- population %>% rep_sample_n(size = s2, reps = reps) + + expect_equal(ncol(res1), ncol(res2)) + expect_equal(ncol(res1), 3) + + expect_equal(nrow(res1) / s1, nrow(res2) / s2) + expect_equal(nrow(res1), reps * s1) +}) + +test_that("rep_sample_n is sensitive to the reps argument", { + set.seed(1) + r1 <- 10 + r2 <- 5 + size <- 2 + + res1 <- population %>% rep_sample_n(size = size, reps = r1) + res2 <- population %>% rep_sample_n(size = size, reps = r2) + + expect_equal(ncol(res1), ncol(res2)) + expect_equal(ncol(res1), 3) + + expect_equal(nrow(res1) / r1, nrow(res2) / r2) + expect_equal(nrow(res1), reps * s1) +}) + +test_that("rep_sample_n is sensitive to the replace argument", { + set.seed(1) + res1 <- population %>% rep_sample_n(size = 5, reps = 100, replace = TRUE) + + set.seed(1) + res2 <- population %>% rep_sample_n(size = 5, reps = 100, replace = FALSE) + + expect_true(all(res1$replicate == res2$replicate)) + expect_false(all(res1$ball_id == res2$ball_id)) + expect_false(all(res1$color == res2$color)) + + expect_equal(ncol(res1), ncol(res2)) + expect_equal(ncol(res1), 3) +}) + +test_that("rep_sample_n is sensitive to the prob argument", { + set.seed(1) + res1 <- population %>% + rep_sample_n( + size = 5, + reps = 100, + replace = TRUE, + prob = c(1, rep(0, 4)) + ) + + expect_true(all(res1$ball_id == 1)) + expect_true(all(res1$color == "red")) +}) + +test_that("rep_sample_n errors with bad arguments", { expect_error( population %>% rep_sample_n(size = 2, reps = 10, prob = rep(x = 1/5, times = 100)) @@ -19,9 +78,23 @@ test_that("rep_sample_n works", { rep_sample_n(size = 2, reps = 10, prob = c(1/2, 1/2)) ) - expect_equal(c("replicate", names(population)), names(test_rep)) + expect_error( + population %>% + rep_sample_n(size = "a lot", reps = 10) + ) + + expect_error( + population %>% + rep_sample_n(size = 2, reps = "a lot") + ) }) test_that("rep_slice_sample works", { + set.seed(1) + res1 <- rep_sample_n(population, size = 2, reps = 5, prob = rep(1/N, N)) + + set.seed(1) + res2 <- rep_slice_sample(population, n = 2, reps = 5, weight_by = rep(1/N, N)) + expect_equal(res1, res2) }) From 96048bf7ce9876422a1c528e9a6ffed9fe3d9cbc Mon Sep 17 00:00:00 2001 From: "Simon P. Couch" Date: Fri, 24 Jul 2020 16:23:02 -0700 Subject: [PATCH 03/13] fix test failure --- tests/testthat/test-rep_sample_n.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-rep_sample_n.R b/tests/testthat/test-rep_sample_n.R index fb02d145..cab65d51 100644 --- a/tests/testthat/test-rep_sample_n.R +++ b/tests/testthat/test-rep_sample_n.R @@ -35,7 +35,7 @@ test_that("rep_sample_n is sensitive to the reps argument", { expect_equal(ncol(res1), 3) expect_equal(nrow(res1) / r1, nrow(res2) / r2) - expect_equal(nrow(res1), reps * s1) + expect_equal(nrow(res1), r1 * size) }) test_that("rep_sample_n is sensitive to the replace argument", { From bd7e72ff0c8348e66be07e065ec31a4d4d2e2bfa Mon Sep 17 00:00:00 2001 From: "Simon P. Couch" Date: Fri, 24 Jul 2020 16:52:19 -0700 Subject: [PATCH 04/13] rename/order arguments re: dplyr::slice_sample() --- R/rep_sample_n.R | 5 +++-- man/rep_sample_n.Rd | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/R/rep_sample_n.R b/R/rep_sample_n.R index 4edffd78..0ebcd9f2 100644 --- a/R/rep_sample_n.R +++ b/R/rep_sample_n.R @@ -13,6 +13,7 @@ #' @param reps Number of samples of size n = `size` to take. #' @param prob A vector of sampling weights for each of the rows in `tbl`—must #' have length equal to `nrow(tbl)`. +#' @param .data Data frame of population from which to sample. #' @param n Sample size of each sample. #' @param weight_by A vector of sampling weights for each of the rows #' in `tbl`—must have length equal to `nrow(tbl)`. @@ -70,6 +71,6 @@ rep_sample_n <- function(tbl, size, replace = FALSE, reps = 1, prob = NULL) { #' @rdname rep_sample_n #' @export -rep_slice_sample <- function(tbl, n, replace = FALSE, reps = 1, weight_by = NULL) { - rep_sample_n(tbl, n, replace, reps, weight_by) +rep_slice_sample <- function(.data, n = 1, replace = FALSE, weight_by = NULL, reps = 1) { + rep_sample_n(.data, n, replace, reps, weight_by) } diff --git a/man/rep_sample_n.Rd b/man/rep_sample_n.Rd index 460493f6..7c7cad32 100644 --- a/man/rep_sample_n.Rd +++ b/man/rep_sample_n.Rd @@ -7,7 +7,7 @@ \usage{ rep_sample_n(tbl, size, replace = FALSE, reps = 1, prob = NULL) -rep_slice_sample(tbl, n, replace = FALSE, reps = 1, weight_by = NULL) +rep_slice_sample(.data, n = 1, replace = FALSE, weight_by = NULL, reps = 1) } \arguments{ \item{tbl}{Data frame of population from which to sample.} @@ -21,6 +21,8 @@ rep_slice_sample(tbl, n, replace = FALSE, reps = 1, weight_by = NULL) \item{prob}{A vector of sampling weights for each of the rows in \code{tbl}—must have length equal to \code{nrow(tbl)}.} +\item{.data}{Data frame of population from which to sample.} + \item{n}{Sample size of each sample.} \item{weight_by}{A vector of sampling weights for each of the rows From 149bb132079881cfb206e2d5d393619d92432a7d Mon Sep 17 00:00:00 2001 From: "Simon P. Couch" Date: Fri, 24 Jul 2020 17:06:57 -0700 Subject: [PATCH 05/13] update NEWS --- NEWS.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 96772352..76aab3a2 100755 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,9 @@ # infer 0.5.3.9000 (development version) -To be released as 0.5.4. +- `rep_sample_n()` no longer errors when supplied a `prob` argument (#279) +- Added `rep_slice_sample()`, a light wrapper around `rep_sample_n()`, that +more closely resembles `dplyr::slice_sample()` (the function that supersedes) +`dplyr::sample_n()` (#325) # infer 0.5.3 From ede7bcc7593cf0eee8943d58f77e757970c1c3ec Mon Sep 17 00:00:00 2001 From: "Simon P. Couch" Date: Fri, 24 Jul 2020 19:21:49 -0700 Subject: [PATCH 06/13] rename resamples -> slices, adjust docs document argument aliases in the same param rather than duplicating descriptions --- R/rep_sample_n.R | 16 ++++++---------- man/rep_sample_n.Rd | 19 ++++++------------- 2 files changed, 12 insertions(+), 23 deletions(-) diff --git a/R/rep_sample_n.R b/R/rep_sample_n.R index 0ebcd9f2..e5c18f64 100644 --- a/R/rep_sample_n.R +++ b/R/rep_sample_n.R @@ -7,16 +7,12 @@ #' This operation is especially helpful while creating sampling #' distributions—see the examples below! #' -#' @param tbl Data frame of population from which to sample. -#' @param size Sample size of each sample. +#' @param tbl,.data Data frame of population from which to sample. +#' @param size,n Sample size of each sample. #' @param replace Should sampling be with replacement? #' @param reps Number of samples of size n = `size` to take. -#' @param prob A vector of sampling weights for each of the rows in `tbl`—must +#' @param prob,weight_by A vector of sampling weights for each of the rows in `tbl`—must #' have length equal to `nrow(tbl)`. -#' @param .data Data frame of population from which to sample. -#' @param n Sample size of each sample. -#' @param weight_by A vector of sampling weights for each of the rows -#' in `tbl`—must have length equal to `nrow(tbl)`. #' #' @return A tibble of size `rep * size` rows corresponding to `reps` #' samples of size `size` from `tbl`, grouped by `replicate`. @@ -31,14 +27,14 @@ #' library(ggplot2) #' #' # take 1000 samples of size n = 50, without replacement -#' resamples <- gss %>% +#' slices <- gss %>% #' rep_sample_n(size = 50, reps = 1000) #' -#' resamples +#' slices #' #' # compute the proportion of respondents with a college #' # degree in each replicate -#' p_hats <- resamples %>% +#' p_hats <- slices %>% #' group_by(replicate) %>% #' summarize(prop_college = mean(college == "degree")) #' diff --git a/man/rep_sample_n.Rd b/man/rep_sample_n.Rd index 7c7cad32..752f5d66 100644 --- a/man/rep_sample_n.Rd +++ b/man/rep_sample_n.Rd @@ -10,23 +10,16 @@ rep_sample_n(tbl, size, replace = FALSE, reps = 1, prob = NULL) rep_slice_sample(.data, n = 1, replace = FALSE, weight_by = NULL, reps = 1) } \arguments{ -\item{tbl}{Data frame of population from which to sample.} +\item{tbl, .data}{Data frame of population from which to sample.} -\item{size}{Sample size of each sample.} +\item{size, n}{Sample size of each sample.} \item{replace}{Should sampling be with replacement?} \item{reps}{Number of samples of size n = \code{size} to take.} -\item{prob}{A vector of sampling weights for each of the rows in \code{tbl}—must +\item{prob, weight_by}{A vector of sampling weights for each of the rows in \code{tbl}—must have length equal to \code{nrow(tbl)}.} - -\item{.data}{Data frame of population from which to sample.} - -\item{n}{Sample size of each sample.} - -\item{weight_by}{A vector of sampling weights for each of the rows -in \code{tbl}—must have length equal to \code{nrow(tbl)}.} } \value{ A tibble of size \code{rep * size} rows corresponding to \code{reps} @@ -49,14 +42,14 @@ library(dplyr) library(ggplot2) # take 1000 samples of size n = 50, without replacement -resamples <- gss \%>\% +slices <- gss \%>\% rep_sample_n(size = 50, reps = 1000) -resamples +slices # compute the proportion of respondents with a college # degree in each replicate -p_hats <- resamples \%>\% +p_hats <- slices \%>\% group_by(replicate) \%>\% summarize(prop_college = mean(college == "degree")) From 6856cc47bc65e08cf2d1a8253238894fa98ef9d7 Mon Sep 17 00:00:00 2001 From: echasnovski Date: Sat, 25 Jul 2020 12:40:15 +0300 Subject: [PATCH 07/13] Lint 'rep_sample_n.R'. --- R/rep_sample_n.R | 36 ++++++++++++++++++++---------------- man/rep_sample_n.Rd | 23 +++++++++++++---------- 2 files changed, 33 insertions(+), 26 deletions(-) diff --git a/R/rep_sample_n.R b/R/rep_sample_n.R index e5c18f64..d136150f 100644 --- a/R/rep_sample_n.R +++ b/R/rep_sample_n.R @@ -2,47 +2,50 @@ #' #' @description #' -#' These functions extend the functionality of [dplyr::sample_n()] and +#' These functions extend the functionality of [dplyr::sample_n()] and #' [dplyr::slice_sample()] by allowing for repeated sampling of data. -#' This operation is especially helpful while creating sampling +#' This operation is especially helpful while creating sampling #' distributions—see the examples below! #' #' @param tbl,.data Data frame of population from which to sample. #' @param size,n Sample size of each sample. #' @param replace Should sampling be with replacement? #' @param reps Number of samples of size n = `size` to take. -#' @param prob,weight_by A vector of sampling weights for each of the rows in `tbl`—must -#' have length equal to `nrow(tbl)`. +#' @param prob,weight_by A vector of sampling weights for each of the rows in +#' `tbl`—must have length equal to `nrow(tbl)`. #' #' @return A tibble of size `rep * size` rows corresponding to `reps` #' samples of size `size` from `tbl`, grouped by `replicate`. -#' -#' @details The [dplyr::sample_n()] function from that `rep_sample_n()` function -#' was originally written to supplement has been superseded -#' by [dplyr::slice_sample()]. `rep_slice_sample()` provides a light wrapper -#' around `rep_sample_n()` that has a more similar interface to `slice_sample()`. +#' +#' @details The [dplyr::sample_n()] function from that `rep_sample_n()` +#' function was originally written to supplement has been superseded by +#' [dplyr::slice_sample()]. `rep_slice_sample()` provides a light wrapper +#' around `rep_sample_n()` that has a more similar interface to +#' `slice_sample()`. #' #' @examples #' library(dplyr) #' library(ggplot2) -#' +#' #' # take 1000 samples of size n = 50, without replacement #' slices <- gss %>% #' rep_sample_n(size = 50, reps = 1000) -#' +#' #' slices #' #' # compute the proportion of respondents with a college #' # degree in each replicate #' p_hats <- slices %>% #' group_by(replicate) %>% -#' summarize(prop_college = mean(college == "degree")) +#' summarize(prop_college = mean(college == "degree")) #' #' # plot sampling distribution #' ggplot(p_hats, aes(x = prop_college)) + #' geom_density() + -#' labs(x = "p_hat", y = "Number of samples", -#' title = "Sampling distribution of p_hat") +#' labs( +#' x = "p_hat", y = "Number of samples", +#' title = "Sampling distribution of p_hat" +#' ) #' @export rep_sample_n <- function(tbl, size, replace = FALSE, reps = 1, prob = NULL) { check_type(tbl, is.data.frame) @@ -59,7 +62,7 @@ rep_sample_n <- function(tbl, size, replace = FALSE, reps = 1, prob = NULL) { dplyr::slice_sample(n = size, weight_by = prob, replace = replace) ) %>% dplyr::mutate( - replicate = rep(1:reps, each = size), + replicate = rep(1:reps, each = size), .before = dplyr::everything() ) %>% dplyr::group_by(replicate) @@ -67,6 +70,7 @@ rep_sample_n <- function(tbl, size, replace = FALSE, reps = 1, prob = NULL) { #' @rdname rep_sample_n #' @export -rep_slice_sample <- function(.data, n = 1, replace = FALSE, weight_by = NULL, reps = 1) { +rep_slice_sample <- function(.data, n = 1, replace = FALSE, weight_by = NULL, + reps = 1) { rep_sample_n(.data, n, replace, reps, weight_by) } diff --git a/man/rep_sample_n.Rd b/man/rep_sample_n.Rd index 752f5d66..7b2d9a5a 100644 --- a/man/rep_sample_n.Rd +++ b/man/rep_sample_n.Rd @@ -18,8 +18,8 @@ rep_slice_sample(.data, n = 1, replace = FALSE, weight_by = NULL, reps = 1) \item{reps}{Number of samples of size n = \code{size} to take.} -\item{prob, weight_by}{A vector of sampling weights for each of the rows in \code{tbl}—must -have length equal to \code{nrow(tbl)}.} +\item{prob, weight_by}{A vector of sampling weights for each of the rows in +\code{tbl}—must have length equal to \code{nrow(tbl)}.} } \value{ A tibble of size \code{rep * size} rows corresponding to \code{reps} @@ -32,10 +32,11 @@ This operation is especially helpful while creating sampling distributions—see the examples below! } \details{ -The \code{\link[dplyr:sample_n]{dplyr::sample_n()}} function from that \code{rep_sample_n()} function -was originally written to supplement has been superseded -by \code{\link[dplyr:slice]{dplyr::slice_sample()}}. \code{rep_slice_sample()} provides a light wrapper -around \code{rep_sample_n()} that has a more similar interface to \code{slice_sample()}. +The \code{\link[dplyr:sample_n]{dplyr::sample_n()}} function from that \code{rep_sample_n()} +function was originally written to supplement has been superseded by +\code{\link[dplyr:slice]{dplyr::slice_sample()}}. \code{rep_slice_sample()} provides a light wrapper +around \code{rep_sample_n()} that has a more similar interface to +\code{slice_sample()}. } \examples{ library(dplyr) @@ -44,18 +45,20 @@ library(ggplot2) # take 1000 samples of size n = 50, without replacement slices <- gss \%>\% rep_sample_n(size = 50, reps = 1000) - + slices # compute the proportion of respondents with a college # degree in each replicate p_hats <- slices \%>\% group_by(replicate) \%>\% - summarize(prop_college = mean(college == "degree")) + summarize(prop_college = mean(college == "degree")) # plot sampling distribution ggplot(p_hats, aes(x = prop_college)) + geom_density() + - labs(x = "p_hat", y = "Number of samples", - title = "Sampling distribution of p_hat") + labs( + x = "p_hat", y = "Number of samples", + title = "Sampling distribution of p_hat" + ) } From 7a13f51fd3b62ce7743472f320ea81d0b6d5ec45 Mon Sep 17 00:00:00 2001 From: echasnovski Date: Sat, 25 Jul 2020 13:48:14 +0300 Subject: [PATCH 08/13] Update/revert `rep_sample_n()` implementation. --- R/rep_sample_n.R | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/R/rep_sample_n.R b/R/rep_sample_n.R index d136150f..e11a83f3 100644 --- a/R/rep_sample_n.R +++ b/R/rep_sample_n.R @@ -54,17 +54,27 @@ rep_sample_n <- function(tbl, size, replace = FALSE, reps = 1, prob = NULL) { check_type(reps, is.numeric) if (!is.null(prob)) { check_type(prob, is.numeric) + if (length(prob) != nrow(tbl)) { + stop_glue( + "The argument `prob` must have length `nrow(tbl)` = {nrow(tbl)}" + ) + } } - 1:reps %>% - purrr::map_dfr( - ~ tbl %>% - dplyr::slice_sample(n = size, weight_by = prob, replace = replace) - ) %>% - dplyr::mutate( - replicate = rep(1:reps, each = size), - .before = dplyr::everything() - ) %>% + # Generate row indexes for every future replicate (this way it respects + # possibility of `replace = FALSE`) + n <- nrow(tbl) + i <- unlist(replicate( + reps, + sample.int(n, size, replace = replace, prob = prob), + simplify = FALSE + )) + + tbl %>% + dplyr::slice(i) %>% + dplyr::mutate(replicate = rep(seq_len(reps), each = size)) %>% + dplyr::select(replicate, dplyr::everything()) %>% + tibble::as_tibble() %>% dplyr::group_by(replicate) } From 7b376b0de6857424be4457019ca4fb52d5e95d91 Mon Sep 17 00:00:00 2001 From: echasnovski Date: Sat, 25 Jul 2020 14:01:24 +0300 Subject: [PATCH 09/13] Update `rep_sample_n()` tests. --- tests/testthat/test-rep_sample_n.R | 46 ++++++++++++++++-------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/tests/testthat/test-rep_sample_n.R b/tests/testthat/test-rep_sample_n.R index cab65d51..a79bce0f 100644 --- a/tests/testthat/test-rep_sample_n.R +++ b/tests/testthat/test-rep_sample_n.R @@ -11,13 +11,13 @@ test_that("rep_sample_n is sensitive to the size argument", { reps <- 10 s1 <- 2 s2 <- 3 - + res1 <- population %>% rep_sample_n(size = s1, reps = reps) res2 <- population %>% rep_sample_n(size = s2, reps = reps) expect_equal(ncol(res1), ncol(res2)) expect_equal(ncol(res1), 3) - + expect_equal(nrow(res1) / s1, nrow(res2) / s2) expect_equal(nrow(res1), reps * s1) }) @@ -27,13 +27,13 @@ test_that("rep_sample_n is sensitive to the reps argument", { r1 <- 10 r2 <- 5 size <- 2 - + res1 <- population %>% rep_sample_n(size = size, reps = r1) res2 <- population %>% rep_sample_n(size = size, reps = r2) - + expect_equal(ncol(res1), ncol(res2)) expect_equal(ncol(res1), 3) - + expect_equal(nrow(res1) / r1, nrow(res2) / r2) expect_equal(nrow(res1), r1 * size) }) @@ -41,28 +41,32 @@ test_that("rep_sample_n is sensitive to the reps argument", { test_that("rep_sample_n is sensitive to the replace argument", { set.seed(1) res1 <- population %>% rep_sample_n(size = 5, reps = 100, replace = TRUE) - + set.seed(1) res2 <- population %>% rep_sample_n(size = 5, reps = 100, replace = FALSE) - + expect_true(all(res1$replicate == res2$replicate)) expect_false(all(res1$ball_id == res2$ball_id)) expect_false(all(res1$color == res2$color)) - + expect_equal(ncol(res1), ncol(res2)) expect_equal(ncol(res1), 3) + + # Check if there are actually no duplicates in case `replace = FALSE` + no_duplicates <- all(tapply(res2$ball_id, res2$replicate, anyDuplicated) == 0) + expect_true(no_duplicates) }) test_that("rep_sample_n is sensitive to the prob argument", { set.seed(1) - res1 <- population %>% + res1 <- population %>% rep_sample_n( - size = 5, - reps = 100, + size = 5, + reps = 100, replace = TRUE, prob = c(1, rep(0, 4)) ) - + expect_true(all(res1$ball_id == 1)) expect_true(all(res1$color == "red")) }) @@ -70,19 +74,19 @@ test_that("rep_sample_n is sensitive to the prob argument", { test_that("rep_sample_n errors with bad arguments", { expect_error( population %>% - rep_sample_n(size = 2, reps = 10, prob = rep(x = 1/5, times = 100)) + rep_sample_n(size = 2, reps = 10, prob = rep(x = 1 / 5, times = 100)) ) - + expect_error( population %>% - rep_sample_n(size = 2, reps = 10, prob = c(1/2, 1/2)) + rep_sample_n(size = 2, reps = 10, prob = c(1 / 2, 1 / 2)) ) - + expect_error( population %>% rep_sample_n(size = "a lot", reps = 10) ) - + expect_error( population %>% rep_sample_n(size = 2, reps = "a lot") @@ -91,10 +95,10 @@ test_that("rep_sample_n errors with bad arguments", { test_that("rep_slice_sample works", { set.seed(1) - res1 <- rep_sample_n(population, size = 2, reps = 5, prob = rep(1/N, N)) - + res1 <- rep_sample_n(population, size = 2, reps = 5, prob = rep(1 / N, N)) + set.seed(1) - res2 <- rep_slice_sample(population, n = 2, reps = 5, weight_by = rep(1/N, N)) - + res2 <- rep_slice_sample(population, n = 2, reps = 5, weight_by = rep(1 / N, N)) + expect_equal(res1, res2) }) From 8519b79fe786344339e13fbe0e05aec47c27c235 Mon Sep 17 00:00:00 2001 From: echasnovski Date: Sat, 25 Jul 2020 14:01:43 +0300 Subject: [PATCH 10/13] Tweak `rep_sample_n()` documentation. --- R/rep_sample_n.R | 9 ++++----- man/rep_sample_n.Rd | 9 ++++----- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/R/rep_sample_n.R b/R/rep_sample_n.R index e11a83f3..ede89cb1 100644 --- a/R/rep_sample_n.R +++ b/R/rep_sample_n.R @@ -17,11 +17,10 @@ #' @return A tibble of size `rep * size` rows corresponding to `reps` #' samples of size `size` from `tbl`, grouped by `replicate`. #' -#' @details The [dplyr::sample_n()] function from that `rep_sample_n()` -#' function was originally written to supplement has been superseded by -#' [dplyr::slice_sample()]. `rep_slice_sample()` provides a light wrapper -#' around `rep_sample_n()` that has a more similar interface to -#' `slice_sample()`. +#' @details The [dplyr::sample_n()] function (to which `rep_sample_n()` was +#' originally a supplement) has been superseded by [dplyr::slice_sample()]. +#' `rep_slice_sample()` provides a light wrapper around `rep_sample_n()` that +#' has a more similar interface to `slice_sample()`. #' #' @examples #' library(dplyr) diff --git a/man/rep_sample_n.Rd b/man/rep_sample_n.Rd index 7b2d9a5a..ee7f396d 100644 --- a/man/rep_sample_n.Rd +++ b/man/rep_sample_n.Rd @@ -32,11 +32,10 @@ This operation is especially helpful while creating sampling distributions—see the examples below! } \details{ -The \code{\link[dplyr:sample_n]{dplyr::sample_n()}} function from that \code{rep_sample_n()} -function was originally written to supplement has been superseded by -\code{\link[dplyr:slice]{dplyr::slice_sample()}}. \code{rep_slice_sample()} provides a light wrapper -around \code{rep_sample_n()} that has a more similar interface to -\code{slice_sample()}. +The \code{\link[dplyr:sample_n]{dplyr::sample_n()}} function (to which \code{rep_sample_n()} was +originally a supplement) has been superseded by \code{\link[dplyr:slice]{dplyr::slice_sample()}}. +\code{rep_slice_sample()} provides a light wrapper around \code{rep_sample_n()} that +has a more similar interface to \code{slice_sample()}. } \examples{ library(dplyr) From 58457a228d60146b50ee9bddd73386ba393fb088 Mon Sep 17 00:00:00 2001 From: rudeboybert Date: Sat, 25 Jul 2020 12:09:01 -0400 Subject: [PATCH 11/13] Added weighted sampling example to rep_sample_n() --- R/rep_sample_n.R | 8 ++++++++ man/rep_sample_n.Rd | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/R/rep_sample_n.R b/R/rep_sample_n.R index ede89cb1..0ac05d84 100644 --- a/R/rep_sample_n.R +++ b/R/rep_sample_n.R @@ -45,6 +45,14 @@ #' x = "p_hat", y = "Number of samples", #' title = "Sampling distribution of p_hat" #' ) +#' +#' # sampling with probability weights. Note probabilities are automatically +#' # renormalized to sum to 1 +#' df <- tibble( +#' id = 1:5, +#' letter = factor(c("a", "b", "c", "d", "e")) +#' ) +#' rep_sample_n(df, size = 4, reps = 2, prob = c(.5, .4, .3, .2, .1)) #' @export rep_sample_n <- function(tbl, size, replace = FALSE, reps = 1, prob = NULL) { check_type(tbl, is.data.frame) diff --git a/man/rep_sample_n.Rd b/man/rep_sample_n.Rd index ee7f396d..f5dd7d52 100644 --- a/man/rep_sample_n.Rd +++ b/man/rep_sample_n.Rd @@ -60,4 +60,12 @@ ggplot(p_hats, aes(x = prop_college)) + x = "p_hat", y = "Number of samples", title = "Sampling distribution of p_hat" ) + +# sampling with probability weights. Note probabilities are automatically +# renormalized to sum to 1 +df <- tibble( + id = 1:5, + letter = factor(c("a", "b", "c", "d", "e")) +) +rep_sample_n(df, size = 4, reps = 2, prob = c(.5, .4, .3, .2, .1)) } From 1863946b2569afe787b45a6ad0a323f83193c222 Mon Sep 17 00:00:00 2001 From: rudeboybert Date: Sat, 25 Jul 2020 12:12:06 -0400 Subject: [PATCH 12/13] Added tibble pkg to ex --- R/rep_sample_n.R | 1 + man/rep_sample_n.Rd | 1 + 2 files changed, 2 insertions(+) diff --git a/R/rep_sample_n.R b/R/rep_sample_n.R index 0ac05d84..504debd1 100644 --- a/R/rep_sample_n.R +++ b/R/rep_sample_n.R @@ -48,6 +48,7 @@ #' #' # sampling with probability weights. Note probabilities are automatically #' # renormalized to sum to 1 +#' library(tibble) #' df <- tibble( #' id = 1:5, #' letter = factor(c("a", "b", "c", "d", "e")) diff --git a/man/rep_sample_n.Rd b/man/rep_sample_n.Rd index f5dd7d52..13499145 100644 --- a/man/rep_sample_n.Rd +++ b/man/rep_sample_n.Rd @@ -63,6 +63,7 @@ ggplot(p_hats, aes(x = prop_college)) + # sampling with probability weights. Note probabilities are automatically # renormalized to sum to 1 +library(tibble) df <- tibble( id = 1:5, letter = factor(c("a", "b", "c", "d", "e")) From 6e6c5121fa733f59535e2d6ee4282d2b40e122f4 Mon Sep 17 00:00:00 2001 From: rudeboybert Date: Sat, 25 Jul 2020 12:29:36 -0400 Subject: [PATCH 13/13] Lowered (w/o replacement) sample size & increased replicate number to make weighting more prominent --- R/rep_sample_n.R | 2 +- man/rep_sample_n.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/rep_sample_n.R b/R/rep_sample_n.R index 504debd1..ee7323de 100644 --- a/R/rep_sample_n.R +++ b/R/rep_sample_n.R @@ -53,7 +53,7 @@ #' id = 1:5, #' letter = factor(c("a", "b", "c", "d", "e")) #' ) -#' rep_sample_n(df, size = 4, reps = 2, prob = c(.5, .4, .3, .2, .1)) +#' rep_sample_n(df, size = 2, reps = 5, prob = c(.5, .4, .3, .2, .1)) #' @export rep_sample_n <- function(tbl, size, replace = FALSE, reps = 1, prob = NULL) { check_type(tbl, is.data.frame) diff --git a/man/rep_sample_n.Rd b/man/rep_sample_n.Rd index 13499145..99482370 100644 --- a/man/rep_sample_n.Rd +++ b/man/rep_sample_n.Rd @@ -68,5 +68,5 @@ df <- tibble( id = 1:5, letter = factor(c("a", "b", "c", "d", "e")) ) -rep_sample_n(df, size = 4, reps = 2, prob = c(.5, .4, .3, .2, .1)) +rep_sample_n(df, size = 2, reps = 5, prob = c(.5, .4, .3, .2, .1)) }