Update boosting and bart implementations in simulations (grf-labs#487)
* updated boosting and bart implementations in simulations

* added wages simulation code

* edited wages code for clarity

* added wages call in main file

* updated wages code to reflect new train test split

* added explicit rlearner call for readability
rinafriedberg authored Aug 21, 2019
1 parent 5d79dd0 commit 8e5a2a4
Showing 5 changed files with 129 additions and 68 deletions.
4 changes: 1 addition & 3 deletions experiments/local_linear_examples/bias_image.R
@@ -17,9 +17,7 @@ ll.forest = local_linear_forest(X, Y)

# lasso to select local linear correction variables
lasso.mod = cv.glmnet(X, Y, alpha=1)
-selected = which(coef(lasso.mod) != 0)
-# remove intercept and adjust indexes correspondingly
-selected = selected[2:length(selected)] - 1
+selected = as.numeric(predict(lasso.mod, type = "nonzero"))
preds.llf = predict(ll.forest, linear.correction.variables = selected, tune.lambda = TRUE)$predictions

ticks = seq(-1, 1, length = 2000)
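Both the old coef()-based extraction and the new type = "nonzero" call read the cv.glmnet fit at its default penalty, s = "lambda.1se". To select the correction variables at the CV-minimizing penalty instead, the value can be passed explicitly; a one-line sketch, not part of this commit:

selected = as.numeric(predict(lasso.mod, type = "nonzero", s = "lambda.min"))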
38 changes: 5 additions & 33 deletions experiments/local_linear_examples/boundary_bias_table.R
@@ -2,34 +2,14 @@ library(grf)
library(glmnet)
library(BART)
library(xgboost)
+library(rlearner)

set.seed(1234)

mu = function(x){
log(1+exp(6*x[1]))
}

-boosting.cv = function(dtrain, Y){
-
-# cross-validate on a reasonable grid
-etas = seq(0, 0.3, by = 0.1)
-nrounds = c(50, 100, 500, 1000)
-max_depth = seq(2, 8, by = 2)
-
-args.cv = expand.grid(e = etas, n = nrounds, d = max_depth)
-results = t(apply(args.cv, MARGIN = 1, FUN = function(arguments){
-model.xgb = xgboost(dtrain, nrounds = arguments[2],
-params = list(objective = "reg:linear"),
-eval_metric = "rmse",
-eta = arguments[1],
-max_depth = arguments[3])
-xgb.preds = predict(model.xgb, newdata = dtrain)
-mean((xgb.preds - Y)**2)
-}))
-best.index = which.min(results)
-return(args.cv[best.index,])
-}

simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000){
errors = replicate(num.reps, {
X = matrix(runif(n*p, -1, 1), nrow = n)
@@ -57,7 +37,7 @@ simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000){

# use lasso to select linear correction variables
lasso.mod = cv.glmnet(X, Y, alpha = 1)
-lasso.coef = predict(lasso.mod, type = "nonzero")
+lasso.coef = as.numeric(predict(lasso.mod, type = "nonzero"))
if(!is.null(dim(lasso.coef))){
selected = lasso.coef[,1]
} else {
@@ -86,20 +66,12 @@ simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000){
lasso.rf.preds = predict(rf, newdata = X.test)$predictions + predict(lasso.mod, newx = X.test, s = "lambda.min")
lasso.rf.mse = mean((lasso.rf.preds-truth)**2)

-bart.mod = wbart(X, Y, X.test, nskip = 5, ndpost = 5)
+bart.mod = wbart(X, Y, X.test)
bart.preds = bart.mod$yhat.test.mean
bart.mse = mean((bart.preds-truth)**2)

-dtrain = xgb.DMatrix(X, label = Y)
-cv.variables = boosting.cv(dtrain, Y)
-cv.variables = as.numeric(cv.variables)
-eta = cv.variables[1]
-nrounds = cv.variables[2]
-max_depth = cv.variables[3]
-
-model.xgb = xgboost(dtrain, nrounds = nrounds, params = list(objective = "reg:linear"),
-eval_metric = "rmse", eta = eta, max_depth = max_depth)
-xgb.preds = predict(model.xgb, newdata = X.test)
+boost.cv.fit = rlearner::cvboost(as.matrix(X), Y)
+xgb.preds = predict(boost.cv.fit, as.matrix(X.test))
xg.mse = mean((xgb.preds - truth)**2)

c(llf.stepwise.mse, llf.oracle, rf.mse, rf.adapt.mse, lasso.rf.mse, bart.mse, xg.mse, llf.lasso.mse)
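The deleted boosting.cv helper scored every (eta, nrounds, max_depth) combination by predicting back on the same dtrain it was fit to, so it compared training errors rather than held-out errors; in both table scripts it is replaced by rlearner::cvboost, which tunes xgboost by cross-validation. For readers without the rlearner package, a rough stand-in using xgboost's built-in k-fold CV might look like the sketch below (an illustrative assumption, not the committed code; the grid values are arbitrary):

dtrain = xgb.DMatrix(as.matrix(X), label = Y)
grid = expand.grid(eta = c(0.1, 0.2, 0.3), nrounds = c(50, 100, 500), max_depth = c(2, 4, 6))
cv.rmse = apply(grid, 1, function(g) {
  cv = xgb.cv(params = list(objective = "reg:linear", eta = g["eta"], max_depth = g["max_depth"]),
              data = dtrain, nrounds = g["nrounds"], nfold = 5, metrics = "rmse", verbose = 0)
  min(cv$evaluation_log$test_rmse_mean)  # smallest held-out RMSE over the boosting rounds
})
best = grid[which.min(cv.rmse), ]  # settings to refit with on the full training data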
38 changes: 6 additions & 32 deletions experiments/local_linear_examples/friedman_table.R
@@ -2,34 +2,14 @@ library(grf)
library(glmnet)
library(BART)
library(xgboost)
+library(rlearner)

set.seed(1234)

ff = function(x){
return(10*sin(pi*x[1]*x[2]) + 20*((x[3] - 0.5)**2) + 10*x[4] + 5*x[5])
}

-boosting.cv = function(dtrain, Y){
-
-# cross-validate on a reasonable grid
-etas = seq(0, 0.3, by = 0.1)
-nrounds = c(50, 100, 500, 1000)
-max_depth = seq(2, 8, by = 2)
-
-args.cv = expand.grid(e = etas, n = nrounds, d = max_depth)
-results = t(apply(args.cv, MARGIN = 1, FUN = function(arguments){
-model.xgb = xgboost(dtrain, nrounds = arguments[2],
-params = list(objective = "reg:linear"),
-eval_metric = "rmse",
-eta = arguments[1],
-max_depth = arguments[3])
-xgb.preds = predict(model.xgb, newdata = dtrain)
-mean((xgb.preds - Y)**2)
-}))
-best.index = which.min(results)
-return(args.cv[best.index,])
-}

simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000, num.trees = 2000){
errors = replicate(num.reps, {
X = matrix(runif(n*p,0,1), nrow = n)
@@ -58,7 +38,8 @@ simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000, num.trees =

# use lasso to select linear correction variables
lasso.mod = cv.glmnet(X, Y, alpha = 1)
-lasso.coef = predict(lasso.mod, type = "nonzero")
+lasso.coef = as.numeric(predict(lasso.mod, type = "nonzero"))
+# if lasso chose no variables, use all for LL correction
if(!is.null(dim(lasso.coef))){
selected = lasso.coef[,1]
} else {
@@ -87,19 +68,12 @@ simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000, num.trees =
lasso.rf.preds = predict(rf, newdata = X.test)$predictions + predict(lasso.mod, newx = X.test, s = "lambda.min")
lasso.rf.mse = mean((lasso.rf.preds-truth)**2)

-bart.mod = wbart(X, Y, X.test, nskip = 5, ndpost = 5)
+bart.mod = wbart(X, Y, X.test)
bart.preds = bart.mod$yhat.test.mean
bart.mse = mean((bart.preds-truth)**2)

-dtrain = xgb.DMatrix(X, label = Y)
-cv.variables = as.numeric(boosting.cv(dtrain, Y))
-eta = cv.variables[1]
-nrounds = cv.variables[2]
-max_depth = cv.variables[3]
-
-model.xgb = xgboost(dtrain, nrounds = nrounds, params = list(objective = "reg:linear"),
-eval_metric = "rmse", eta = eta, max_depth = max_depth)
-xgb.preds = predict(model.xgb, newdata = X.test)
+boost.cv.fit = rlearner::cvboost(as.matrix(X), Y)
+xgb.preds = predict(boost.cv.fit, as.matrix(X.test))
xg.mse = mean((xgb.preds - truth)**2)

c(llf.stepwise.mse, llf.oracle, rf.mse, rf.adapt.mse, lasso.rf.mse, bart.mse, xg.mse, llf.lasso.mse)
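Dropping nskip = 5 and ndpost = 5 from the wbart calls means BART now runs with the package defaults, i.e. a much longer MCMC run than five burn-in and five posterior draws. Written out explicitly (the default values below are quoted from memory of the BART package and should be checked against its documentation):

bart.mod = wbart(x.train = X, y.train = Y, x.test = X.test,
                 nskip = 100,    # burn-in draws that are discarded (package default)
                 ndpost = 1000)  # posterior draws that are kept (package default)
bart.preds = bart.mod$yhat.test.mean  # posterior-mean prediction at each test point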
1 change: 1 addition & 0 deletions experiments/local_linear_examples/main.R
@@ -10,3 +10,4 @@ source("friedman_table.R")
source("confidence.R")
source("causal_table.R")
source("boundary_bias_table.R")
source("wages.R")
116 changes: 116 additions & 0 deletions experiments/local_linear_examples/wages.R
@@ -0,0 +1,116 @@
rm(list = ls())
set.seed(123)

library(grf)
library(glmnet)
library(BART)
library(xgboost)
library(ggplot2)
library(dplyr)
library(splines)
library(rlearner)

# load the data
load("cps1976_2018.RData")
data = data.frame(cps1976_2018)

# clean data: remove NA and missing outcomes
data = data[!is.na(data$incwage),]
data = data[data$incwage != 9999999,] # remove missing
data = data[data$incwage != 9999998,] # remove missing

# extract 2018 data
data = data[data$year == 2018,]

# add age^2, educ^2 covariates
data$agesq = data$age**2
data$educsq = data$educ**2

covariates = c("age", "agesq", "educ", "educsq", "occ2010", "occ10ly", "sex", "race",
"marst", "labforce", "ind1950", "classwkr", "wkstat", "uhrswork1", "metro", "famsize")

continuous.covariates = which(covariates %in% c("age", "educ", "agesq", "educsq", "uhrswork1", "famsize"))
outcome = "incwage"

data = data[,c(covariates, outcome)]
data = data[complete.cases(data),]

# transform outcome (standard for wage regressions)
data$incwage = log(data$incwage + 1)

######################
## ERROR EVALUATION ##
######################

num.reps = 100
size.test = 10000
sample.sizes = c(2000, 5000, 10000, 50000)

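# For each training-set size: draw one training sample, then repeat num.reps times --
# draw a fresh test set of size.test rows from the remaining data, fit every method on
# the training sample, and record its test-set MSE; means and sds are taken across reps.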
mse.sample.sizes = data.frame(t(sapply(sample.sizes, function(size){
index.train = sample(1:nrow(data), size = size, replace = FALSE)

X = data[index.train, covariates]
Y = data$incwage[index.train]

# drop incomplete rows, keeping X and Y aligned
complete.train = complete.cases(X)
X = X[complete.train,]
Y = Y[complete.train]

results = data.frame(t(sapply(1:num.reps, function(i){
print(i)

index.test = sample((1:nrow(data))[-index.train], size = size.test, replace = FALSE)

X.test = data[index.test, covariates]
truth = data$incwage[index.test]

complete.test = complete.cases(X.test)
X.test = X.test[complete.test,]
truth = truth[complete.test]

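# honest regression forest with tuned parameters; the local linear ridge penalty
# (ll.lambda) is tuned separately over the continuous covariates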
forest = regression_forest(as.matrix(X), Y, honesty = TRUE, tune.parameters = TRUE)

ll.lambda = tune_ll_regression_forest(forest, linear.correction.variables = continuous.covariates,
ll.weight.penalty = T)$lambda.min
llf.preds = predict(forest, as.matrix(X.test),
linear.correction.variables = continuous.covariates,
ll.lambda = ll.lambda,
ll.weight.penalty = T)$predictions
llf.mse = mean((llf.preds - truth)**2)

rf.preds = predict(forest, as.matrix(X.test))$predictions
rf.mse = mean((rf.preds - truth)**2)

ols.form = as.formula(paste("Y", paste(covariates, collapse = "+"), sep = "~"))
dd.ols = cbind(Y, X)
ols.fit = lm(ols.form, dd.ols)
ols.preds = predict(ols.fit, X.test)
ols.mse = mean((ols.preds - truth)**2)

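# lasso on all main effects plus their pairwise interactions (model.matrix expands ~ .^2)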
mm = model.matrix( ~.^2, data = X)
lasso.mod = cv.glmnet(mm, Y, alpha = 1)
mmtest = model.matrix( ~.^2, data = X.test)
lasso.preds = predict(lasso.mod, newx = mmtest, s = "lambda.min")
lasso.mse = mean((lasso.preds - truth)**2)

bart.mod = wbart(X, Y, X.test)
bart.preds = bart.mod$yhat.test.mean
bart.mse = mean((bart.preds-truth)**2)

boost.cv.fit = rlearner::cvboost(as.matrix(X), Y)
xgb.preds = predict(boost.cv.fit, as.matrix(X.test))
xg.mse = mean((xgb.preds - truth)**2)

return(c(ols.mse, llf.mse, rf.mse, lasso.mse, xg.mse, bart.mse))
})))

mses = colMeans(results)
sds = apply(results, MARGIN = 2, FUN = sd)

as.numeric(c(mses, sds))
})))

colnames(mse.sample.sizes) = c("OLS", "LLF", "RF", "Lasso", "XG", "BART",
"OLS.sd", "LLF.sd", "RF.sd", "Lasso.sd", "XG.sd", "BART.sd")

mse.sample.sizes$size = sample.sizes

write.csv(mse.sample.sizes,"wages_sample_sizes.csv", row.names = FALSE)
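The script stops after writing the CSV; ggplot2, dplyr, and splines are loaded but not used in this file, so the figures are presumably produced elsewhere. A minimal sketch for plotting the saved results (it assumes tidyr is available for the reshaping step; the column names are the ones written above):

library(ggplot2)
library(tidyr)  # assumption: not loaded by the original script

res = read.csv("wages_sample_sizes.csv")
long = pivot_longer(res, cols = c("OLS", "LLF", "RF", "Lasso", "XG", "BART"),
                    names_to = "method", values_to = "mse")
ggplot(long, aes(x = size, y = mse, color = method)) +
  geom_line() + geom_point() +
  labs(x = "training sample size", y = "mean squared test error")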
