From 5759cbe29d0fad9871eaf6ad81ba2fdc81ed608a Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Fri, 15 Sep 2023 01:47:41 -0500 Subject: [PATCH 1/8] power simulation lecture --- power_simulation.qmd | 84 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 power_simulation.qmd diff --git a/power_simulation.qmd b/power_simulation.qmd new file mode 100644 index 0000000..6440bdd --- /dev/null +++ b/power_simulation.qmd @@ -0,0 +1,84 @@ + +```{julia} +using DataFrames +using MixedModels +using MixedModelsMakie +using MixedModelsSim +using Random +using MixedModels: dataset +``` + +Let us consider the kb07 dataset. + +```{julia} +kb07 = dataset(:kb07) +contrasts = Dict(:spkr => EffectsCoding(), + :prec => EffectsCoding(), + :load => EffectsCoding()) +fm1 = fit(MixedModel, + @formula(rt_trunc ~ 1 * spkr * prec * load + (1|subj) + (1|item)), + kb07; contrasts) +``` +We can perform a *parametric bootstrap* on the model to get estimates of our uncertainty. +In the parametric bootstrap, we use the *parameters* we estimated to simulate new data. +If we repeat this process many times, we are able to "pick ourselves up by our bootstraps" +and examine the variability we would expect to see based purely on chance if the ground truth +exactly matched our estimates. +In this way, we are able to estimate our uncertainty -- we cannot be more certain than the 'natural' +variability we would have for a given parameter value. + +```{julia} +pb1 = parametricbootstrap(MersenneTwister(42), 1000, fm1; optsum_overrides=(;ftol_rel=1e-8)) +``` + +Now, if we look at the docstring for `parametricbootstrap`, we see that there are keyword-arguments +for the various model parameters: +```{julia} +@doc parametricbootstrap` +``` + +```{julia} +subj_btwn = Dict(:age => ["old", "young"]) +item_btwn = Dict(:frequency => ["low", "high"]) +β = [250.0, -25.0, 10, 0.0] +σ = 25.0 +# relative to σ! 
+subj_re = create_re(2.0, 1.3) +item_re = create_re(1.3, 2.0) +``` + +```{julia} +# simulate!(simmod; β, σ, θ) +# datdat[!, :dv] = rand(MersenneTwister(12), [0, 1], nrow(datdat)) +coefpvalues = DataFrame() +rng = MersenneTwister(42) +@showprogress for subj_n in [20, 30, 50, 100] + for item_n in [40, 60, 100] + dat = simdat_crossed(rng, subj_n, item_n; + subj_btwn, item_btwn) + simmod = fit(MixedModel, + @formula(dv ~ 1 + age * frequency + + (1 + frequency | subj) + + (1 + age | item)), dat) + + θ = createθ(simmod; subj=subj_re, item=item_re) + simboot = parametricbootstrap(rng, 1000, simmod; + β, σ, θ, + optsum_overrides=(;ftol_rel=1e-8), + progress=false) + df = DataFrame(simboot.coefpvalues) + df[!, :subj_n] .= subj_n + df[!, :item_n] .= item_n + append!(coefpvalues, df) + end +end +``` + +```{julia} +power = combine(groupby(coefpvalues, [:coefname, :subj_n, :item_n]), + :p => (p -> mean(p .< 0.05)) => :power) +``` + +```{julia} +ridgeplot(simboot) +``` From 5ac0e3e61835f810ac0d23deb713993af55fd0a6 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Fri, 15 Sep 2023 02:49:21 -0500 Subject: [PATCH 2/8] wip --- Project.toml | 2 ++ power_simulation.qmd | 86 +++++++++++++++++++++++++++++--------------- styles.css | 8 +++++ 3 files changed, 67 insertions(+), 29 deletions(-) diff --git a/Project.toml b/Project.toml index a2c7a6e..5f62727 100644 --- a/Project.toml +++ b/Project.toml @@ -26,6 +26,7 @@ Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a" MixedModels = "ff71e718-51f3-5ec2-a782-8ffcbfa3c316" MixedModelsExtras = "781a26e1-49f4-409a-8f4c-c3159d78c17e" MixedModelsMakie = "b12ae82c-6730-437f-aff9-d2c38332a376" +MixedModelsSim = "d5ae56c5-23ca-4a1f-b505-9fc4796fc1fe" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" RCall = "6f49c342-dc21-5d91-9882-a32aef131414" RegressionFormulae = "545c379f-4ec2-4339-9aea-38f2fb6a8ba2" @@ -51,6 +52,7 @@ MacroTools = "0.5" MixedModels = "4.22" MixedModelsExtras = "1, 2" MixedModelsMakie = "0.3.26" +MixedModelsSim = "0.2.7" 
RCall = "0.13" RegressionFormulae = "0.1.3" StandardizedPredictors = "1" diff --git a/power_simulation.qmd b/power_simulation.qmd index 6440bdd..c949343 100644 --- a/power_simulation.qmd +++ b/power_simulation.qmd @@ -1,14 +1,29 @@ +--- +title: "Using simulation to estimate uncertainty and power" +subtitle : "Or how I learned how to stop worrying and love the bootstrap" +jupyter: julia-1.9 +format: + html: + embed-resources: true + css: styles.css +--- ```{julia} +#| output: false using DataFrames using MixedModels using MixedModelsMakie using MixedModelsSim +using ProgressMeter using Random + using MixedModels: dataset +ProgressMeter.ijulia_behavior(:clear) ``` -Let us consider the kb07 dataset. +# The Parametric Bootstrap + +Let us consider the `kb07` dataset. ```{julia} kb07 = dataset(:kb07) @@ -16,9 +31,12 @@ contrasts = Dict(:spkr => EffectsCoding(), :prec => EffectsCoding(), :load => EffectsCoding()) fm1 = fit(MixedModel, - @formula(rt_trunc ~ 1 * spkr * prec * load + (1|subj) + (1|item)), + @formula(rt_trunc ~ 1 * spkr * prec * load + + (1 | subj) + + (1 | item)), kb07; contrasts) ``` + We can perform a *parametric bootstrap* on the model to get estimates of our uncertainty. In the parametric bootstrap, we use the *parameters* we estimated to simulate new data. If we repeat this process many times, we are able to "pick ourselves up by our bootstraps" @@ -28,14 +46,24 @@ In this way, we are able to estimate our uncertainty -- we cannot be more certai variability we would have for a given parameter value. ```{julia} -pb1 = parametricbootstrap(MersenneTwister(42), 1000, fm1; optsum_overrides=(;ftol_rel=1e-8)) +pb1 = parametricbootstrap(MersenneTwister(42), 1000, fm1; + optsum_overrides=(;ftol_rel=1e-8)) ``` + +:::{.callout-tip collapse=true, title="`optsum_overrides`"} +The option `optsum_overrides` allows us to pass additional arguments for controlling the model fitting of each simulation replicate. 
+::: + Now, if we look at the docstring for `parametricbootstrap`, we see that there are keyword-arguments for the various model parameters: + +:::{.border id="docstring"} ```{julia} -@doc parametricbootstrap` +#| code-fold: true +@doc parametricbootstrap ``` +::: ```{julia} subj_btwn = Dict(:age => ["old", "young"]) @@ -50,35 +78,35 @@ item_re = create_re(1.3, 2.0) ```{julia} # simulate!(simmod; β, σ, θ) # datdat[!, :dv] = rand(MersenneTwister(12), [0, 1], nrow(datdat)) -coefpvalues = DataFrame() -rng = MersenneTwister(42) -@showprogress for subj_n in [20, 30, 50, 100] - for item_n in [40, 60, 100] - dat = simdat_crossed(rng, subj_n, item_n; - subj_btwn, item_btwn) - simmod = fit(MixedModel, - @formula(dv ~ 1 + age * frequency + - (1 + frequency | subj) + - (1 + age | item)), dat) - - θ = createθ(simmod; subj=subj_re, item=item_re) - simboot = parametricbootstrap(rng, 1000, simmod; - β, σ, θ, - optsum_overrides=(;ftol_rel=1e-8), - progress=false) - df = DataFrame(simboot.coefpvalues) - df[!, :subj_n] .= subj_n - df[!, :item_n] .= item_n - append!(coefpvalues, df) - end -end +# coefpvalues = DataFrame() +# rng = MersenneTwister(42) +# @showprogress for subj_n in [20, 30, 50, 100] +# for item_n in [40, 60, 100] +# dat = simdat_crossed(rng, subj_n, item_n; +# subj_btwn, item_btwn) +# simmod = fit(MixedModel, +# @formula(dv ~ 1 + age * frequency + +# (1 + frequency | subj) + +# (1 + age | item)), dat) + +# θ = createθ(simmod; subj=subj_re, item=item_re) +# simboot = parametricbootstrap(rng, 1000, simmod; +# β, σ, θ, +# optsum_overrides=(;ftol_rel=1e-8), +# progress=false) +# df = DataFrame(simboot.coefpvalues) +# df[!, :subj_n] .= subj_n +# df[!, :item_n] .= item_n +# append!(coefpvalues, df) +# end +# end ``` ```{julia} -power = combine(groupby(coefpvalues, [:coefname, :subj_n, :item_n]), - :p => (p -> mean(p .< 0.05)) => :power) +# power = combine(groupby(coefpvalues, [:coefname, :subj_n, :item_n]), +# :p => (p -> mean(p .< 0.05)) => :power) ``` ```{julia} 
-ridgeplot(simboot) +# ridgeplot(simboot) ``` diff --git a/styles.css b/styles.css index 2ddf50c..c21b6d5 100644 --- a/styles.css +++ b/styles.css @@ -1 +1,9 @@ /* css styles */ + +#docstring{ + border: solid black; + border-width: thick; + background-color: #f8f5f0; + margin-left: 5%; + margin-right: 5%; +} From a46a974dafe06239c694cbfaebdc130f07bf061e Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Fri, 15 Sep 2023 07:36:18 -0500 Subject: [PATCH 3/8] wip --- power_simulation.qmd | 57 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/power_simulation.qmd b/power_simulation.qmd index c949343..14a8415 100644 --- a/power_simulation.qmd +++ b/power_simulation.qmd @@ -11,6 +11,7 @@ format: ```{julia} #| output: false using DataFrames +using Distributions using MixedModels using MixedModelsMakie using MixedModelsSim @@ -53,10 +54,10 @@ pb1 = parametricbootstrap(MersenneTwister(42), 1000, fm1; :::{.callout-tip collapse=true, title="`optsum_overrides`"} The option `optsum_overrides` allows us to pass additional arguments for controlling the model fitting of each simulation replicate. +`ftol_rel=1e-8` lowers the threshold for changes in the objective -- more directly, the optimizer considers the model converged if the change in the deviance is less than $10^{-8}$, which is a very small change, but larger than the default $10^{-12}$. Because the majority of the optimization time is spent in final fine-tuning, changing this threshold can greatly speed up the fitting time at the cost of a small loss of quality in fit. For a stochastic process like the bootstrap, that change in quality just adds to the general noise, but that's acceptable tradeoff in order to get many more replicates. 
::: -Now, if we look at the docstring for `parametricbootstrap`, we see that there are keyword-arguments -for the various model parameters: +Now, if we look at the docstring for `parametricbootstrap`, we see that there are keyword-arguments for the various model parameters: :::{.border id="docstring"} ```{julia} @@ -65,6 +66,58 @@ for the various model parameters: ``` ::: +These keyword arguments are forward on to the `simulate!` function, which simulates a new dataset based on model matrices and parameter values. +The model matrices are simply taken from the model at hand. +By default, the parameter values are the estimated parameter values from a fitted model. + +:::{.border id="docstring"} +```{julia} +#| code-fold: true +@doc simulate! +``` +::: + +So now we have a way to simulate new data with new parameter values once we have a model. +We just need a way to create a model with our preferred design. +We'll use the MixedModelsSim package for that. + +## Simulating data from scratch. + +The MixedModelsSim package provides a function `simdat_crossed` for simulating effects from a crossed design: + +:::{.border id="docstring"} +```{julia} +#| code-fold: true +@doc simdat_crossed +``` +::: + +Let's see what that looks like in practice. +We'll look at a simple 2 x 2 design with 20 subjects and 20 items. +Our first factor `age` will vary betwen subjects and have the levels `old` and `young`. +Our second factor `frequency` will vary between items and have the levels `low` and `high`. +Finally, we also need to specify a random number generator to use for seeding the data simulation. + +```{julia} +subj_n = 20 +item_n = 20 +subj_btwn = Dict(:age => ["old", "young"]) +item_btwn = Dict(:frequency => ["low", "high"]) +dat = simdat_crossed(MersenneTwister(42), subj_n, item_n; + subj_btwn, item_btwn) +Table(dat) +``` + +We have 400 rows -- 20 subjects x 20 items. +Similarly, the experimental factors are expanded out to be fully crossed. 
+Finally, we have a dependent variable `dv` initalized to be draws from the standard normal distribution $N(0,1)$. + +:::{.callout-note title="Latin squares, partial crossing, and continuous covariates"} +`simdat_crossed` is designed to simulate a fully crossed factorial design. +If you have a partially crossed or Latin squaresdesign, then you could delete the "extra" cells to reduce the fully crossed data here to be partially crossed. +::: + + ```{julia} subj_btwn = Dict(:age => ["old", "young"]) item_btwn = Dict(:frequency => ["low", "high"]) From fa49ae05975b8dfcf4653cefef478b05ab367f9e Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Fri, 15 Sep 2023 07:47:25 -0500 Subject: [PATCH 4/8] wip --- power_simulation.qmd | 49 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 4 deletions(-) diff --git a/power_simulation.qmd b/power_simulation.qmd index 14a8415..5515708 100644 --- a/power_simulation.qmd +++ b/power_simulation.qmd @@ -10,6 +10,7 @@ format: ```{julia} #| output: false +using CairoMakie using DataFrames using Distributions using MixedModels @@ -103,7 +104,8 @@ subj_n = 20 item_n = 20 subj_btwn = Dict(:age => ["old", "young"]) item_btwn = Dict(:frequency => ["low", "high"]) -dat = simdat_crossed(MersenneTwister(42), subj_n, item_n; +const RNG = MersenneTwister(42) +dat = simdat_crossed(RNG, subj_n, item_n; subj_btwn, item_btwn) Table(dat) ``` @@ -115,21 +117,60 @@ Finally, we have a dependent variable `dv` initalized to be draws from the stand :::{.callout-note title="Latin squares, partial crossing, and continuous covariates"} `simdat_crossed` is designed to simulate a fully crossed factorial design. If you have a partially crossed or Latin squaresdesign, then you could delete the "extra" cells to reduce the fully crossed data here to be partially crossed. +For continuous covariates, we need to separately construct the covariates and then use a tabular join to create the design. +We'll examine an example of this later. 
::: +```{julia} +simmod = fit(MixedModel, + @formula(dv ~ 1 + age * frequency + + (1 + frequency | subj) + + (1 + age | item)), dat) +println(simmod) +``` + ```{julia} -subj_btwn = Dict(:age => ["old", "young"]) -item_btwn = Dict(:frequency => ["low", "high"]) β = [250.0, -25.0, 10, 0.0] +simulate!(RNG, simmod; β) +``` + + +```{julia} +fit!(simmod) +``` + +```{julia} σ = 25.0 +fit!(simulate!(RNG, simmod; β, σ)) +``` + +```{julia} # relative to σ! subj_re = create_re(2.0, 1.3) +``` + +```{julia} item_re = create_re(1.3, 2.0) ``` ```{julia} -# simulate!(simmod; β, σ, θ) +θ = createθ(simmod; subj=subj_re, item=item_re) +``` + +```{julia} +fit!(simulate!(RNG, simmod; β, σ, θ)) +``` + +```{julia} +samp = parametricbootstrap(RNG, 1000, simmod; β, σ, θ) +``` + +```{julia} +ridgeplot(samp) +``` + +```{julia} # datdat[!, :dv] = rand(MersenneTwister(12), [0, 1], nrow(datdat)) # coefpvalues = DataFrame() # rng = MersenneTwister(42) From ef1041c85dbd2514acb149fd23917c83b557f816 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Fri, 15 Sep 2023 08:40:33 -0500 Subject: [PATCH 5/8] wip --- power_simulation.qmd | 84 +++++++++++++++++++++++++++++++------------- 1 file changed, 59 insertions(+), 25 deletions(-) diff --git a/power_simulation.qmd b/power_simulation.qmd index 5515708..3fdf0d2 100644 --- a/power_simulation.qmd +++ b/power_simulation.qmd @@ -10,6 +10,7 @@ format: ```{julia} #| output: false +using AlgebraOfGraphics using CairoMakie using DataFrames using Distributions @@ -17,8 +18,10 @@ using MixedModels using MixedModelsMakie using MixedModelsSim using ProgressMeter +using StatsBase using Random +using AlgebraOfGraphics: AlgebraOfGraphics as AOG using MixedModels: dataset ProgressMeter.ijulia_behavior(:clear) ``` @@ -171,36 +174,67 @@ ridgeplot(samp) ``` ```{julia} -# datdat[!, :dv] = rand(MersenneTwister(12), [0, 1], nrow(datdat)) -# coefpvalues = DataFrame() -# rng = MersenneTwister(42) -# @showprogress for subj_n in [20, 30, 50, 100] -# for item_n in [40, 60, 100] 
-# dat = simdat_crossed(rng, subj_n, item_n; -# subj_btwn, item_btwn) -# simmod = fit(MixedModel, -# @formula(dv ~ 1 + age * frequency + -# (1 + frequency | subj) + -# (1 + age | item)), dat) +let f = Figure() + ax = Axis(f[1, 1]) + coefplot!(ax, samp; + conf_level=0.8, + vline_at_zero=true, + show_intercept=true) + ridgeplot!(ax, samp; + conf_level=0.8, + vline_at_zero=true, + show_intercept=true, + xlabel="Normalized density and 80% range") + scatter!(ax, β, length(β):-1:1; + marker=:x, + markersize=20, + color=:red) + f +end +``` + + +```{julia} +coefpvalues = DataFrame() +@showprogress for subj_n in [20, 30, 50, 100], item_n in [40, 60, 100] + dat = simdat_crossed(RNG, subj_n, item_n; + subj_btwn, item_btwn) + simmod = fit(MixedModel, + @formula(dv ~ 1 + age * frequency + + (1 + frequency | subj) + + (1 + age | item)), dat) + + θ = createθ(simmod; subj=subj_re, item=item_re) + simboot = parametricbootstrap(RNG, 1000, simmod; + β, σ, θ, + optsum_overrides=(;ftol_rel=1e-8), + progress=false) + df = DataFrame(simboot.coefpvalues) + df[!, :subj_n] .= subj_n + df[!, :item_n] .= item_n + append!(coefpvalues, df) +end +``` + +```{julia} +power = combine(groupby(coefpvalues, [:coefname, :subj_n, :item_n]), + :p => (p -> mean(p .< 0.05)) => :power) +``` + +```{julia} +power = combine(groupby(coefpvalues, + [:coefname, :subj_n, :item_n]), + :p => (p -> mean(p .< 0.05)) => :power, + :p => (p -> sem(p .< 0.05)) => :power_se) -# θ = createθ(simmod; subj=subj_re, item=item_re) -# simboot = parametricbootstrap(rng, 1000, simmod; -# β, σ, θ, -# optsum_overrides=(;ftol_rel=1e-8), -# progress=false) -# df = DataFrame(simboot.coefpvalues) -# df[!, :subj_n] .= subj_n -# df[!, :item_n] .= item_n -# append!(coefpvalues, df) -# end -# end ``` ```{julia} -# power = combine(groupby(coefpvalues, [:coefname, :subj_n, :item_n]), -# :p => (p -> mean(p .< 0.05)) => :power) +select!(power, :coefname, :subj_n, :item_n, :power, + [:power, :power_se] => ByRow((p, se) -> [p - 1.96*se, p + 
1.96*se]) => [:lower, :upper]) ``` + ```{julia} -# ridgeplot(simboot) +data(power) * mapping(:subj_n, :item_n, :power; layout=:coefname) * visual(Heatmap) |> draw ``` From 93026d468ce657c610ef2c55665c52cafe9f8998 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Fri, 15 Sep 2023 09:37:07 -0500 Subject: [PATCH 6/8] wip --- power_simulation.qmd | 56 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/power_simulation.qmd b/power_simulation.qmd index 3fdf0d2..1220f74 100644 --- a/power_simulation.qmd +++ b/power_simulation.qmd @@ -132,6 +132,7 @@ simmod = fit(MixedModel, println(simmod) ``` +make sure to discuss contrasts here ```{julia} β = [250.0, -25.0, 10, 0.0] @@ -196,16 +197,16 @@ end ```{julia} coefpvalues = DataFrame() -@showprogress for subj_n in [20, 30, 50, 100], item_n in [40, 60, 100] +@showprogress for subj_n in [20, 40, 60, 80, 100, 120, 140], item_n in [40, 60, 80, 100, 120, 140] dat = simdat_crossed(RNG, subj_n, item_n; subj_btwn, item_btwn) simmod = fit(MixedModel, @formula(dv ~ 1 + age * frequency + (1 + frequency | subj) + - (1 + age | item)), dat) + (1 + age | item)), dat; progress=false) θ = createθ(simmod; subj=subj_re, item=item_re) - simboot = parametricbootstrap(RNG, 1000, simmod; + simboot = parametricbootstrap(RNG, 100, simmod; β, σ, θ, optsum_overrides=(;ftol_rel=1e-8), progress=false) @@ -238,3 +239,52 @@ select!(power, :coefname, :subj_n, :item_n, :power, ```{julia} data(power) * mapping(:subj_n, :item_n, :power; layout=:coefname) * visual(Heatmap) |> draw ``` + + +```{julia} +dat = simdat_crossed(RNG, subj_n, item_n; + subj_btwn, item_btwn) +dat = DataFrame(dat) +``` + +```{julia} +item_covariates = unique!(select(dat, :item)) +``` + + +```{julia} +item_covariates[!, :chaos] = rand(rng, + Normal(5, 2), + nrow(item_covariates)) +``` + +```{julia} +leftjoin!(dat, item_covariates; on=:item) +``` + +```{julia} +simmod = fit(MixedModel, + @formula(dv ~ 1 + age * frequency * chaos + + (1 
+ frequency | subj) + + (1 + age | item)), dat; contrasts) +``` + +TODO: continuous covariate +TODO: bernoulli response +TODO: savereplicates + +```{julia} +dat[!, :dv] = rand(rng, Bernoulli(), nrow(dat)) +``` + +```{julia} +dat[!, :dv] = rand(rng, Poisson(), nrow(dat)) +``` + +```{julia} +dat[!, :n] = rand(rng, Poisson(), nrow(dat)) .+ 3 +``` + +```{julia} +dat[!, :dv] = rand.(rng, Binomial.(dat[!, :n])) ./ dat[!, :n] +``` From 3db3b706c0127a7b4ee199697d2028277ae2a8db Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Tue, 7 Nov 2023 09:46:51 -0600 Subject: [PATCH 7/8] Apply suggestions from code review Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- power_simulation.qmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/power_simulation.qmd b/power_simulation.qmd index 1220f74..538b26c 100644 --- a/power_simulation.qmd +++ b/power_simulation.qmd @@ -98,7 +98,7 @@ The MixedModelsSim package provides a function `simdat_crossed` for simulating e Let's see what that looks like in practice. We'll look at a simple 2 x 2 design with 20 subjects and 20 items. -Our first factor `age` will vary betwen subjects and have the levels `old` and `young`. +Our first factor `age` will vary between subjects and have the levels `old` and `young`. Our second factor `frequency` will vary between items and have the levels `low` and `high`. Finally, we also need to specify a random number generator to use for seeding the data simulation. @@ -115,7 +115,7 @@ Table(dat) We have 400 rows -- 20 subjects x 20 items. Similarly, the experimental factors are expanded out to be fully crossed. -Finally, we have a dependent variable `dv` initalized to be draws from the standard normal distribution $N(0,1)$. +Finally, we have a dependent variable `dv` initialized to be draws from the standard normal distribution $N(0,1)$. 
:::{.callout-note title="Latin squares, partial crossing, and continuous covariates"}
`simdat_crossed` is designed to simulate a fully crossed factorial design.
If you have a partially crossed or Latin squaresdesign, then you could delete the "extra" cells to reduce the fully crossed data here to be partially crossed.
From 57d8f286fd846273ce4b94b002aeec4fdd6b34f0 Mon Sep 17 00:00:00 2001
From: Phillip Alday
Date: Wed, 26 Jun 2024 22:17:25 -0500
Subject: [PATCH 8/8] formatting

---
 power_simulation.qmd | 106 +++++++++++++++++++++----------------------
 1 file changed, 53 insertions(+), 53 deletions(-)

diff --git a/power_simulation.qmd b/power_simulation.qmd
index 538b26c..61bd87d 100644
--- a/power_simulation.qmd
+++ b/power_simulation.qmd
@@ -1,6 +1,6 @@
 ---
 title: "Using simulation to estimate uncertainty and power"
-subtitle : "Or how I learned how to stop worrying and love the bootstrap"
+subtitle: "Or how I learned how to stop worrying and love the bootstrap"
 jupyter: julia-1.9
 format:
   html:
@@ -35,30 +35,30 @@ kb07 = dataset(:kb07)
 contrasts = Dict(:spkr => EffectsCoding(),
                  :prec => EffectsCoding(),
                  :load => EffectsCoding())
-fm1 = fit(MixedModel, 
-          @formula(rt_trunc ~ 1 * spkr * prec * load + 
-          (1 | subj) + 
+fm1 = fit(MixedModel,
+          @formula(rt_trunc ~ 1 + spkr * prec * load +
+          (1 | subj) +
           (1 | item)), kb07; contrasts)
-``` 
+```
 We can perform a *parametric bootstrap* on the model to get estimates of our uncertainty.
-In the parametric bootstrap, we use the *parameters* we estimated to simulate new data. 
-If we repeat this process many times, we are able to "pick ourselves up by our bootstraps" 
-and examine the variability we would expect to see based purely on chance if the ground truth 
+In the parametric bootstrap, we use the *parameters* we estimated to simulate new data.
+If we repeat this process many times, we are able to "pick ourselves up by our bootstraps"
+and examine the variability we would expect to see based purely on chance if the ground truth
 exactly matched our estimates.
In this way, we are able to estimate our uncertainty -- we cannot be more certain than the 'natural'
variability we would have for a given parameter value.

```{julia}
-pb1 = parametricbootstrap(MersenneTwister(42), 1000, fm1; 
+pb1 = parametricbootstrap(MersenneTwister(42), 1000, fm1;
                           optsum_overrides=(;ftol_rel=1e-8))
```

:::{.callout-tip collapse=true, title="`optsum_overrides`"}
The option `optsum_overrides` allows us to pass additional arguments for controlling the model fitting of each simulation replicate.
-`ftol_rel=1e-8` lowers the threshold for changes in the objective -- more directly, the optimizer considers the model converged if the change in the deviance is less than $10^{-8}$, which is a very small change, but larger than the default $10^{-12}$. Because the majority of the optimization time is spent in final fine-tuning, changing this threshold can greatly speed up the fitting time at the cost of a small loss of quality in fit. For a stochastic process like the bootstrap, that change in quality just adds to the general noise, but that's acceptable tradeoff in order to get many more replicates. 
+`ftol_rel=1e-8` lowers the threshold for changes in the objective -- more directly, the optimizer considers the model converged if the change in the deviance is less than $10^{-8}$, which is a very small change, but larger than the default $10^{-12}$. Because the majority of the optimization time is spent in final fine-tuning, changing this threshold can greatly speed up the fitting time at the cost of a small loss of quality in fit. For a stochastic process like the bootstrap, that change in quality just adds to the general noise, but that's an acceptable tradeoff in order to get many more replicates.
:::

Now, if we look at the docstring for `parametricbootstrap`, we see that there are keyword-arguments for the various model parameters:

:::{.border id="docstring"}
```{julia}
#| code-fold: true
@doc parametricbootstrap
```
:::

-These keyword arguments are forward on to the `simulate!` function, which simulates a new dataset based on model matrices and parameter values. 
-The model matrices are simply taken from the model at hand. 
-By default, the parameter values are the estimated parameter values from a fitted model. 
+These keyword arguments are forwarded on to the `simulate!` function, which simulates a new dataset based on model matrices and parameter values.
+The model matrices are simply taken from the model at hand.
+By default, the parameter values are the estimated parameter values from a fitted model.

:::{.border id="docstring"}
```{julia}
#| code-fold: true
@doc simulate!
```
:::

-So now we have a way to simulate new data with new parameter values once we have a model. 
+So now we have a way to simulate new data with new parameter values once we have a model.
We just need a way to create a model with our preferred design.
We'll use the MixedModelsSim package for that.

## Simulating data from scratch.

The MixedModelsSim package provides a function `simdat_crossed` for simulating effects from a crossed design:

:::{.border id="docstring"}
```{julia}
#| code-fold: true
@doc simdat_crossed
```
:::

-Let's see what that looks like in practice. 
+Let's see what that looks like in practice.
We'll look at a simple 2 x 2 design with 20 subjects and 20 items.
Our first factor `age` will vary between subjects and have the levels `old` and `young`.
Our second factor `frequency` will vary between items and have the levels `low` and `high`.
@@ -108,26 +108,26 @@ item_n = 20 subj_btwn = Dict(:age => ["old", "young"]) item_btwn = Dict(:frequency => ["low", "high"]) const RNG = MersenneTwister(42) -dat = simdat_crossed(RNG, subj_n, item_n; - subj_btwn, item_btwn) +dat = simdat_crossed(RNG, subj_n, item_n; + subj_btwn, item_btwn) Table(dat) ``` We have 400 rows -- 20 subjects x 20 items. -Similarly, the experimental factors are expanded out to be fully crossed. +Similarly, the experimental factors are expanded out to be fully crossed. Finally, we have a dependent variable `dv` initialized to be draws from the standard normal distribution $N(0,1)$. :::{.callout-note title="Latin squares, partial crossing, and continuous covariates"} -`simdat_crossed` is designed to simulate a fully crossed factorial design. +`simdat_crossed` is designed to simulate a fully crossed factorial design. If you have a partially crossed or Latin squaresdesign, then you could delete the "extra" cells to reduce the fully crossed data here to be partially crossed. -For continuous covariates, we need to separately construct the covariates and then use a tabular join to create the design. +For continuous covariates, we need to separately construct the covariates and then use a tabular join to create the design. We'll examine an example of this later. 
::: ```{julia} -simmod = fit(MixedModel, - @formula(dv ~ 1 + age * frequency + - (1 + frequency | subj) + +simmod = fit(MixedModel, + @formula(dv ~ 1 + age * frequency + + (1 + frequency | subj) + (1 + age | item)), dat) println(simmod) ``` @@ -178,16 +178,16 @@ ridgeplot(samp) let f = Figure() ax = Axis(f[1, 1]) coefplot!(ax, samp; - conf_level=0.8, - vline_at_zero=true, + conf_level=0.8, + vline_at_zero=true, show_intercept=true) - ridgeplot!(ax, samp; - conf_level=0.8, - vline_at_zero=true, + ridgeplot!(ax, samp; + conf_level=0.8, + vline_at_zero=true, show_intercept=true, xlabel="Normalized density and 80% range") - scatter!(ax, β, length(β):-1:1; - marker=:x, + scatter!(ax, β, length(β):-1:1; + marker=:x, markersize=20, color=:red) f @@ -198,32 +198,32 @@ end ```{julia} coefpvalues = DataFrame() @showprogress for subj_n in [20, 40, 60, 80, 100, 120, 140], item_n in [40, 60, 80, 100, 120, 140] - dat = simdat_crossed(RNG, subj_n, item_n; + dat = simdat_crossed(RNG, subj_n, item_n; subj_btwn, item_btwn) - simmod = fit(MixedModel, - @formula(dv ~ 1 + age * frequency + - (1 + frequency | subj) + - (1 + age | item)), dat; progress=false) + simmod = MixedModel(@formula(dv ~ 1 + age * frequency + + (1 + frequency | subj) + + (1 + age | item)), + dat) θ = createθ(simmod; subj=subj_re, item=item_re) - simboot = parametricbootstrap(RNG, 100, simmod; - β, σ, θ, + simboot = parametricbootstrap(RNG, 100, simmod; + β, σ, θ, optsum_overrides=(;ftol_rel=1e-8), progress=false) df = DataFrame(simboot.coefpvalues) df[!, :subj_n] .= subj_n - df[!, :item_n] .= item_n - append!(coefpvalues, df) + df[!, :item_n] .= item_n + append!(coefpvalues, df) end ``` ```{julia} -power = combine(groupby(coefpvalues, [:coefname, :subj_n, :item_n]), +power = combine(groupby(coefpvalues, [:coefname, :subj_n, :item_n]), :p => (p -> mean(p .< 0.05)) => :power) ``` ```{julia} -power = combine(groupby(coefpvalues, +power = combine(groupby(coefpvalues, [:coefname, :subj_n, :item_n]), :p => (p -> mean(p 
.< 0.05)) => :power, :p => (p -> sem(p .< 0.05)) => :power_se) @@ -242,9 +242,9 @@ data(power) * mapping(:subj_n, :item_n, :power; layout=:coefname) * visual(Heatm ```{julia} -dat = simdat_crossed(RNG, subj_n, item_n; +dat = simdat_crossed(RNG, subj_n, item_n; subj_btwn, item_btwn) -dat = DataFrame(dat) +dat = DataFrame(dat) ``` ```{julia} @@ -253,9 +253,9 @@ item_covariates = unique!(select(dat, :item)) ```{julia} -item_covariates[!, :chaos] = rand(rng, - Normal(5, 2), - nrow(item_covariates)) +item_covariates[!, :chaos] = rand(RNG, + Normal(5, 2), + nrow(item_covariates)) ``` ```{julia} @@ -263,9 +263,9 @@ leftjoin!(dat, item_covariates; on=:item) ``` ```{julia} -simmod = fit(MixedModel, - @formula(dv ~ 1 + age * frequency * chaos + - (1 + frequency | subj) + +simmod = fit(MixedModel, + @formula(dv ~ 1 + age * frequency * chaos + + (1 + frequency | subj) + (1 + age | item)), dat; contrasts) ``` @@ -274,17 +274,17 @@ TODO: bernoulli response TODO: savereplicates ```{julia} -dat[!, :dv] = rand(rng, Bernoulli(), nrow(dat)) +dat[!, :dv] = rand(RNG, Bernoulli(), nrow(dat)) ``` ```{julia} -dat[!, :dv] = rand(rng, Poisson(), nrow(dat)) +dat[!, :dv] = rand(RNG, Poisson(), nrow(dat)) ``` ```{julia} -dat[!, :n] = rand(rng, Poisson(), nrow(dat)) .+ 3 +dat[!, :n] = rand(RNG, Poisson(), nrow(dat)) .+ 3 ``` ```{julia} -dat[!, :dv] = rand.(rng, Binomial.(dat[!, :n])) ./ dat[!, :n] +dat[!, :dv] = rand.(RNG, Binomial.(dat[!, :n])) ./ dat[!, :n] ```