Skip to content

Commit

Permalink
Merge branch 'simulation' to incorporate reviewed changes of simulati…
Browse files Browse the repository at this point in the history
…on code for revision 1 of manuscript.
  • Loading branch information
e3bo committed Feb 26, 2018
2 parents 43cae75 + ccbd975 commit 33d0685
Show file tree
Hide file tree
Showing 56 changed files with 1,413 additions and 16 deletions.
1 change: 1 addition & 0 deletions simulation/run-scripts
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ docker run -v ${ddir}:${hdir}/data:ro \
--user docker -w ${hdir}/work \
eamon/2016feasibility:v20170829 /bin/bash -c "make -f ../src/Makefile"
chown -R $(id -u):$(id -g) ${wkdir}
outputdir=${wkdir} src/make-light.sh
58 changes: 47 additions & 11 deletions simulation/src/analyze_observations.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@ RNGkind("L'Ecuyer-CMRG")
set.seed(1234)
parallel::mc.reset.stream()

load("checkpoint-02.rda")


levs <- list()
levs$bandwidth <- c(35, 100)
levs$lag <- 1
levs$lag <- c(1)
analysis_des_mat <- do.call(expand.grid, levs)

calc_auc <- function(predictions, is_null){
Expand Down Expand Up @@ -59,21 +59,55 @@ analyze_observations <- function(bandwidth, lag, obs){
list(sct=sct, auc=bs$t0, auc_stderr=bssd)
}

parallel_setup <- function(is_small, seed = 1){
### Due to the large size results of parallelized jobs, the SNOW-style
### cluster backend seems better than multicore approach to
### parallelization. With the cluster approach, we can fork before any
### results are computed to create workers rather than over time as
### results are generated. The forked processes share memory with the
### parent. So as the results accumulate and the parent's memory use
### grows, the memory allocated to each forked process also
### grows. Even though the forked processes don't need to modify the
### results, and thus the memory actually used by them is not great,
### the allocation seems to trigger some mechanism that kills jobs on
### the Olympus cluster. It could be the Linux OOM killer.
require(foreach)
if (is_small) {
clust <- parallel::makeForkCluster(nnodes = 2, outfile = "cluster-outfile")
doParallel::registerDoParallel(cl = clust, outfile = "cluster-outfile")
} else {
clust <- parallel::makeForkCluster(nnodes = 23)
doParallel::registerDoParallel(cl = clust)
}
parallel::clusterSetRNGStream(cl = clust, iseed = seed)
clust
}

is_small_scale <- Sys.getenv("is_small_scale") == "TRUE"
cluster <- parallel_setup(is_small = is_small_scale)

load("checkpoint-02.rda")
options <- list(preschedule = FALSE) # To keep results <2GB limit

soit <- iterators::iter(simulated_observations)

analyzed_observations <-
foreach (i=seq(1, nrow(process_des_mat)),
.options.multicore=list(set.seed=TRUE, preschedule=FALSE)) %:%
foreach (j=seq(1, nrow(observation_des_mat)),
.options.multicore=list(set.seed=TRUE, preschedule=FALSE)) %:%
foreach (m=seq(1, nrow(analysis_des_mat)),
.options.multicore=list(set.seed=TRUE, preschedule=FALSE)) %dopar% {
foreach (so = soit, .options.snow = options) %:%
foreach (so2 = iterators::iter(so), .options.snow = options) %:%
foreach (m = iterators::iter(analysis_des_mat, by = "row"),
.options.snow = options) %dopar% {
do.call(analyze_observations,
c(analysis_des_mat[m, ],
list(obs=simulated_observations[[i]][[j]])))
}
c(m, list(obs = so2)))
}
warnings()

parallel::stopCluster(cluster)

res <- list()
n <- 1



for (i in seq(1, nrow(process_des_mat))){
pvars <- process_des_mat[i, ]
for (j in seq(1, nrow(observation_des_mat))){
Expand All @@ -92,3 +126,5 @@ for (i in seq(1, nrow(process_des_mat))){
res <- do.call(rbind, res)

save.image(file="checkpoint-03.rda")

save(res, file="res.RData")
4 changes: 0 additions & 4 deletions simulation/src/make-light.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
#!/usr/bin/env bash

cd $outputdir
../src/get-res.R
../src/get-cases.R
cd ..
lightname="${outputdir}-light"
zip -r $lightname ${outputdir}/ --exclude \*.rda
27 changes: 26 additions & 1 deletion simulation/src/simulate_cases.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ set.seed(3234)
parallel::mc.reset.stream()

levs <- list()
levs$external_forcing <- c(1 / 7)
levs$external_forcing <- c(1 / 7, 1 / 30, 1 / 365)
levs$host_lifetime <- c(70 * 365)


Expand Down Expand Up @@ -56,3 +56,28 @@ simulated_procs <- foreach(i=seq(1, nrow(process_des_mat)),
do.call(sample_process, process_des_mat[i, ])

save.image(file="checkpoint-01.rda")

## Next we'll output a times series of cases for one of the figures in
## the paper. We'll delete observations that do not occur at multiples
## of ``snapshots`` to save space because they are not used in any
## further analysis

test_snap <- function(x, period){
mod <- x %% period
abs(mod) < .Machine$double.eps^.5
}

get_snaps <- function(df) {
tms <- df$time
snapshots <- c(7, 30)
tests <- lapply(snapshots, function(per) test_snap(tms, period = per))
is_snap <- colSums(do.call(rbind, tests)) > 0
df[is_snap, ]
}

for(i in seq_len(nrow(process_des_mat))){
simulated_procs[[i]]$test <- get_snaps(simulated_procs[[i]]$test)
simulated_procs[[i]]$null <- get_snaps(simulated_procs[[i]]$null)
}

save(simulated_procs, process_des_mat, file="cases.RData")
4 changes: 4 additions & 0 deletions simulation0201/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.git
./data/*
./src/*
./work*/*
4 changes: 4 additions & 0 deletions simulation0201/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
*~
.Rhistory
work*
spaero_0.1.0*
48 changes: 48 additions & 0 deletions simulation0201/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
FROM r-base
MAINTAINER Eamon O'Dea <[last name without apostrophe]35@gmail.com>
RUN apt-get update && apt-get install -y -q --no-install-recommends \
openssh-server \
r-cran-coda \
r-cran-curl \
r-cran-desolve \
r-cran-doparallel \
r-cran-evaluate \
r-cran-foreach \
r-cran-formatr \
r-cran-ggplot2 \
r-cran-httr \
r-cran-igraph \
r-cran-iterators \
r-cran-jsonlite \
r-cran-lazyeval \
r-cran-memoise \
r-cran-mime \
r-cran-mvtnorm \
r-cran-nloptr \
r-cran-openssl \
r-cran-praise \
r-cran-rcpp \
r-cran-r6 \
r-cran-testthat \
r-cran-tgp \
r-cran-xtable \
r-cran-yaml
RUN install2.r --error \
earlywarnings \
knitr \
lintr \
np \
pomp \
rmarkdown
COPY ./spaero_0.2.0.1.tar.gz /home/docker/spaero_0.2.0.1.tar.gz
RUN /usr/bin/R CMD check --no-build-vignettes --no-manual /home/docker/spaero_*.tar.gz > /home/docker/check.so \
&& /usr/bin/R CMD INSTALL --build /home/docker/spaero_*.tar.gz
RUN mkdir /var/run/sshd && echo 'docker:screencast' | chpasswd \
&& sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
EXPOSE 22
CMD ["/usr/sbin/sshd", "-D"]
17 changes: 17 additions & 0 deletions simulation0201/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
The ``simulation`` subdirectory contains:
1) simulation results used to generate the figures in the paper contained in the zip archive.
2) All scripts used to generate the simulation results in the directory ``src``.
3) Docker files (see below)
4) A copy of the R package spaero
5) Some additional non-essential but related scripts in ``src``.

To facilitate reproducibility, the scripts were run within a Docker container based on an image that is available on Docker Hub. The file ``build-image`` was used to create the image based on
the ``Dockerfile``. To reproduce the results on systems with Docker installed, run the bash script ``run-scripts`` the ``simulation`` subdirectory. This script will:

1) Pull the Docker container down from Docker Hub and create a time-stamped child directory
2) Run the Makefile in ``src`` to reproduce the full simulation results in the time-stamped child directory

Reproducing the full output is computationally intensive and not recommended on a desktop computer. The full output comes to about 30GB and includes many intermediate results which are not needed to recreate the figures. The script ``src/make-light.sh`` is included to generate a lighter compressed zip version of the results and only includes the data necessary for figures.

It is also possible to run the scripts without Docker, although one might have to consult the Docker image to determine the correct software versions to use. One important dependency, the spaero
R package for calculating the moving window statistics, is included in this subdirectory. The right sequence in which the scripts must be run to produce the results can be seen in ``src/Makefile``.
1 change: 1 addition & 0 deletions simulation0201/build-image
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
docker build -t eamon/2016feasibility .
19 changes: 19 additions & 0 deletions simulation0201/run-scripts
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash

set -e

topd=$(pwd)
tstamp=$(date +%Y-%m-%d_%H-%M-%S)
wkdir="${topd}/work-${tstamp}"
ddir="${topd}/data"
sdir="${topd}/src"
hdir="/home/docker"

mkdir ${wkdir}
chmod 757 ${wkdir}
docker run -v ${ddir}:${hdir}/data:ro \
-v ${sdir}:${hdir}/src:ro \
-v ${wkdir}:${hdir}/work \
--user docker -w ${hdir}/work \
eamon/2016feasibility:v20170829 /bin/bash -c "make -f ../src/Makefile"
chown -R $(id -u):$(id -g) ${wkdir}
Binary file added simulation0201/spaero_0.2.0.1.tar.gz
Binary file not shown.
16 changes: 16 additions & 0 deletions simulation0201/src/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
RS = '/usr/bin/Rscript'
WD = /home/docker/work
SD = /home/docker/src
DD = /home/docker/data
VPATH = $(SD):$(WD)

all: checkpoint-03.rda

checkpoint-03.rda: analyze_observations.R checkpoint-02.rda
$< >$(<F).so 2>$(<F).se

checkpoint-02.rda: simulate_observation.R checkpoint-01.rda
$< >$(<F).so 2>$(<F).se

checkpoint-01.rda: simulate_cases.R
$< >$(<F).so 2>$(<F).se
94 changes: 94 additions & 0 deletions simulation0201/src/analyze_observations.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#!/usr/bin/Rscript

library(foreach)

doParallel::registerDoParallel(cores=5)
RNGkind("L'Ecuyer-CMRG")
set.seed(1234)
parallel::mc.reset.stream()

load("checkpoint-02.rda")

levs <- list()
levs$bandwidth <- c(36, 156)
levs$lag <- 1
analysis_des_mat <- do.call(expand.grid, levs)

calc_auc <- function(predictions, is_null){
test <- !is.na(predictions)
predictions <- predictions[test]
is_null <- is_null[test]
r <- rank(predictions)
r1 <- sum(r[!is_null])
n1 <- sum(!is_null)
n2 <- sum(is_null)
(r1 - n1 * (n1 + 1) / 2) / (n1 * n2)
}

analyze_observations <- function(bandwidth, lag, obs){
get_tau <- function(stat_ts, time){
stats::cor(stat_ts, time, use="pairwise.complete.obs", method="kendall")
}
get_ews_cor <- function(reports, bw=bandwidth, l=lag){
stats <- spaero::get_stats(reports,
center_bandwidth=bw, stat_bandwidth=bw,
center_trend="local_constant",
stat_trend="local_constant",
center_kernel="uniform",
stat_kernel="uniform", lag=l)$stats
taus <- sapply(stats, get_tau, time=seq_along(reports))
list(stats=stats, taus=taus)
}
loop_manip <- function(x){
loop_omult <- function(x) lapply(x, get_ews_cor)
sc <- lapply(x, loop_omult)
taus <- lapply(sc, function(x) sapply(x, "[[", "taus"))
taus <- do.call(cbind, taus)
list(sc=sc, taus=taus)
}
sct <- lapply(obs, loop_manip)
is_null <- c(rep(FALSE, ncol(sct$test$taus)), rep(TRUE, ncol(sct$null$taus)))
preds <- cbind(sct$test$taus, sct$null$taus)

df <- data.frame(is_null, t(preds), row.names=1:ncol(preds))
samp_stat <- function(x, w) {
apply(x[w, -1], 2, calc_auc, is_null=df$is_null[w])
}
bs <- boot::boot(data=df, statistic=samp_stat, R=300)
bssd <- apply(bs$t, 2, sd, na.rm=TRUE)
list(sct=sct, auc=bs$t0, auc_stderr=bssd)
}

analyzed_observations <-
foreach (i=seq(1, nrow(process_des_mat)),
.options.multicore=list(set.seed=TRUE, preschedule=FALSE)) %:%
foreach (j=seq(1, nrow(observation_des_mat)),
.options.multicore=list(set.seed=TRUE, preschedule=FALSE)) %:%
foreach (m=seq(1, nrow(analysis_des_mat)),
.options.multicore=list(set.seed=TRUE, preschedule=FALSE)) %dopar% {
do.call(analyze_observations,
c(analysis_des_mat[m, ],
list(obs=simulated_observations[[i]][[j]])))
}
warnings()

res <- list()
n <- 1
for (i in seq(1, nrow(process_des_mat))){
pvars <- process_des_mat[i, ]
for (j in seq(1, nrow(observation_des_mat))){
ovars <- observation_des_mat[j, ]
for (m in seq(1, nrow(analysis_des_mat))){
auc <- analyzed_observations[[i]][[j]][[m]]$auc
names(auc) <- paste("AUC", names(auc), sep="_")
aucse <- analyzed_observations[[i]][[j]][[m]]$auc_stderr
names(aucse) <- paste(names(auc), "stderr", sep="_")
avars <- analysis_des_mat[m, ]
res[[n]] <- data.frame(c(pvars, ovars, avars, auc, aucse))
n <- n + 1
}
}
}
res <- do.call(rbind, res)

save.image(file="checkpoint-03.rda")
43 changes: 43 additions & 0 deletions simulation0201/src/extract-observations.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/Rscript

load("checkpoint-02.rda")
set.seed(1)

res <- list()
for (i in seq(1, nrow(process_des_mat))){
pvars <- process_des_mat[i, ]
jseq <- sample.int(nrow(observation_des_mat), 5) ## to reduce the size of the data
## for (j in seq(1, nrow(observation_des_mat))){
for (j in jseq){
ovars <- observation_des_mat[j, ]
sim_obs <- simulated_observations[[i]][[j]]
stopifnot(length(sim_obs$null) == 1L) ## To simplify code, we
## assume that there is only
## one realization of the
## observation model per
## realization of the
## process model.
stopifnot(length(sim_obs$test) == 1L)
tdata <- sim_obs$test[[1L]]
ndata <- sim_obs$null[[1L]]
make_df <- function(dat, repid, is_emergence){
t <- seq_along(dat)
data.frame(c(pvars, ovars), process_replicate_id = repid,
is_emergence = is_emergence,
cbind(time_index = t, reported_cases = dat))
}
tdf <- Map(make_df, dat = tdata, repid = seq_along(tdata),
is_emergence = TRUE)
tdn <- Map(make_df, dat = ndata, repid = seq_along(ndata),
is_emergence = FALSE)
res <- c(res, tdf, tdn)
}
}

reports_ts <- do.call(rbind, res)
## Delete potentially confusing column
reports_ts$scenario <- NULL
rownames(reports_ts) <- NULL

write.csv(reports_ts, file="reports_ts.csv")
save(reports_ts, file = "reports_ts.RData")
File renamed without changes.
File renamed without changes.
Loading

0 comments on commit 33d0685

Please sign in to comment.