Update boosting and bart implementations in simulations #487

Merged
merged 7 commits on Aug 21, 2019
Changes from 5 commits
4 changes: 1 addition & 3 deletions experiments/local_linear_examples/bias_image.R
@@ -17,9 +17,7 @@ ll.forest = local_linear_forest(X, Y)

# lasso to select local linear correction variables
lasso.mod = cv.glmnet(X, Y, alpha=1)
selected = which(coef(lasso.mod) != 0)
# remove intercept and adjust indexes correspondingly
selected = selected[2:length(selected)] - 1
selected = as.numeric(predict(lasso.mod, type = "nonzero"))
preds.llf = predict(ll.forest, linear.correction.variables = selected, tune.lambda = TRUE)$predictions

ticks = seq(-1, 1, length = 2000)
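For context on the replacement line above: cv.glmnet can report the selected covariates directly, so the manual intercept/index bookkeeping is no longer needed. A minimal, self-contained sketch of that idiom on illustrative data (the unlist() here is a defensive addition, since the shape of the type = "nonzero" return value has varied across glmnet versions):

library(glmnet)

X = matrix(rnorm(200 * 10), nrow = 200)
Y = X[, 1] + 0.5 * X[, 2] + rnorm(200)

lasso.mod = cv.glmnet(X, Y, alpha = 1)
# indexes of covariates with nonzero lasso coefficients; no intercept to strip
selected = as.numeric(unlist(predict(lasso.mod, type = "nonzero")))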
38 changes: 5 additions & 33 deletions experiments/local_linear_examples/boundary_bias_table.R
@@ -2,34 +2,14 @@ library(grf)
library(glmnet)
library(BART)
library(xgboost)
library(rlearner)

set.seed(1234)

mu = function(x){
log(1+exp(6*x[1]))
}

boosting.cv = function(dtrain, Y){

# cross-validate on a reasonable grid
etas = seq(0, 0.3, by = 0.1)
nrounds = c(50, 100, 500, 1000)
max_depth = seq(2, 8, by = 2)

args.cv = expand.grid(e = etas, n = nrounds, d = max_depth)
results = t(apply(args.cv, MARGIN = 1, FUN = function(arguments){
model.xgb = xgboost(dtrain, nrounds = arguments[2],
params = list(objective = "reg:linear"),
eval_metric = "rmse",
eta = arguments[1],
max_depth = arguments[3])
xgb.preds = predict(model.xgb, newdata = dtrain)
mean((xgb.preds - Y)**2)
}))
best.index = which.min(results)
return(args.cv[best.index,])
}

simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000){
errors = replicate(num.reps, {
X = matrix(runif(n*p, -1, 1), nrow = n)
@@ -57,7 +37,7 @@ simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000){

# use lasso to select linear correction variables
lasso.mod = cv.glmnet(X, Y, alpha = 1)
lasso.coef = predict(lasso.mod, type = "nonzero")
lasso.coef = as.numeric(predict(lasso.mod, type = "nonzero"))
if(!is.null(dim(lasso.coef))){
selected = lasso.coef[,1]
} else {
@@ -86,20 +66,12 @@ simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000){
lasso.rf.preds = predict(rf, newdata = X.test)$predictions + predict(lasso.mod, newx = X.test, s = "lambda.min")
lasso.rf.mse = mean((lasso.rf.preds-truth)**2)

bart.mod = wbart(X, Y, X.test, nskip = 5, ndpost = 5)
bart.mod = wbart(X, Y, X.test)
bart.preds = bart.mod$yhat.test.mean
bart.mse = mean((bart.preds-truth)**2)

dtrain = xgb.DMatrix(X, label = Y)
cv.variables = boosting.cv(dtrain, Y)
cv.variables = as.numeric(cv.variables)
eta = cv.variables[1]
nrounds = cv.variables[2]
max_depth = cv.variables[3]

model.xgb = xgboost(dtrain, nrounds = nrounds, params = list(objective = "reg:linear"),
eval_metric = "rmse", eta = eta, max_depth = max_depth)
xgb.preds = predict(model.xgb, newdata = X.test)
boost.cv.fit = cvboost(as.matrix(X), Y)
xgb.preds = predict(boost.cv.fit, as.matrix(X.test))
xg.mse = mean((xgb.preds - truth)**2)

c(llf.stepwise.mse, llf.oracle, rf.mse, rf.adapt.mse, lasso.rf.mse, bart.mse, xg.mse, llf.lasso.mse)
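The change above swaps the hand-rolled xgboost grid search for rlearner's cross-validated boosting wrapper. A minimal sketch of the new pattern, using the same data-generating setup as this file and assuming the rlearner package (with the cvboost/predict interface used in this diff) is installed:

library(rlearner)

n = 500; p = 10; sigma = 0.5
X = matrix(runif(n * p, -1, 1), nrow = n)
Y = log(1 + exp(6 * X[, 1])) + sigma * rnorm(n)
X.test = matrix(runif(2000 * p, -1, 1), nrow = 2000)
truth = log(1 + exp(6 * X.test[, 1]))

# cvboost tunes the xgboost parameters by cross-validation internally
boost.cv.fit = cvboost(as.matrix(X), Y)
xgb.preds = predict(boost.cv.fit, as.matrix(X.test))
xg.mse = mean((xgb.preds - truth)**2)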
38 changes: 6 additions & 32 deletions experiments/local_linear_examples/friedman_table.R
@@ -2,34 +2,14 @@ library(grf)
library(glmnet)
library(BART)
library(xgboost)
library(rlearner)

set.seed(1234)

ff = function(x){
return(10*sin(pi*x[1]*x[2]) + 20*((x[3] - 0.5)**2) + 10*x[4] + 5*x[5])
}

boosting.cv = function(dtrain, Y){

# cross-validate on a reasonable grid
etas = seq(0, 0.3, by = 0.1)
nrounds = c(50, 100, 500, 1000)
max_depth = seq(2, 8, by = 2)

args.cv = expand.grid(e = etas, n = nrounds, d = max_depth)
results = t(apply(args.cv, MARGIN = 1, FUN = function(arguments){
model.xgb = xgboost(dtrain, nrounds = arguments[2],
params = list(objective = "reg:linear"),
eval_metric = "rmse",
eta = arguments[1],
max_depth = arguments[3])
xgb.preds = predict(model.xgb, newdata = dtrain)
mean((xgb.preds - Y)**2)
}))
best.index = which.min(results)
return(args.cv[best.index,])
}

simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000, num.trees = 2000){
errors = replicate(num.reps, {
X = matrix(runif(n*p,0,1), nrow = n)
@@ -58,7 +38,8 @@ simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000, num.trees =

# use lasso to select linear correction variables
lasso.mod = cv.glmnet(X, Y, alpha = 1)
lasso.coef = predict(lasso.mod, type = "nonzero")
lasso.coef = as.numeric(predict(lasso.mod, type = "nonzero"))
# if lasso chose no variables, use all for LL correction
if(!is.null(dim(lasso.coef))){
selected = lasso.coef[,1]
} else {
@@ -87,19 +68,12 @@ simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000, num.trees =
lasso.rf.preds = predict(rf, newdata = X.test)$predictions + predict(lasso.mod, newx = X.test, s = "lambda.min")
lasso.rf.mse = mean((lasso.rf.preds-truth)**2)

bart.mod = wbart(X, Y, X.test, nskip = 5, ndpost = 5)
bart.mod = wbart(X, Y, X.test)
bart.preds = bart.mod$yhat.test.mean
bart.mse = mean((bart.preds-truth)**2)

dtrain = xgb.DMatrix(X, label = Y)
cv.variables = as.numeric(boosting.cv(dtrain, Y))
eta = cv.variables[1]
nrounds = cv.variables[2]
max_depth = cv.variables[3]

model.xgb = xgboost(dtrain, nrounds = nrounds, params = list(objective = "reg:linear"),
eval_metric = "rmse", eta = eta, max_depth = max_depth)
xgb.preds = predict(model.xgb, newdata = X.test)
boost.cv.fit = cvboost(as.matrix(X), Y)
xgb.preds = predict(boost.cv.fit, as.matrix(X.test))
xg.mse = mean((xgb.preds - truth)**2)

c(llf.stepwise.mse, llf.oracle, rf.mse, rf.adapt.mse, lasso.rf.mse, bart.mse, xg.mse, llf.lasso.mse)
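Similarly, the wbart call now omits nskip and ndpost, so BART runs with the package defaults (on the order of 100 burn-in and 1000 posterior draws; see ?wbart) rather than the 5/5 used before. A minimal sketch on this file's Friedman function:

library(BART)

n = 500; p = 10
X = matrix(runif(n * p, 0, 1), nrow = n)
Y = 10 * sin(pi * X[, 1] * X[, 2]) + 20 * (X[, 3] - 0.5)**2 + 10 * X[, 4] + 5 * X[, 5] + rnorm(n)
X.test = matrix(runif(1000 * p, 0, 1), nrow = 1000)

# package defaults give the sampler many more draws than the previous nskip = 5, ndpost = 5
bart.mod = wbart(X, Y, X.test)
bart.preds = bart.mod$yhat.test.mean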
1 change: 1 addition & 0 deletions experiments/local_linear_examples/main.R
@@ -10,3 +10,4 @@ source("friedman_table.R")
source("confidence.R")
source("causal_table.R")
source("boundary_bias_table.R")
source("wages.R")
116 changes: 116 additions & 0 deletions experiments/local_linear_examples/wages.R
@@ -0,0 +1,116 @@
rm(list = ls())
set.seed(123)

library(grf)
library(glmnet)
library(BART)
library(xgboost)
library(ggplot2)
library(dplyr)
library(splines)
library(rlearner)

# load the data
load("cps1976_2018.RData")
data = data.frame(cps1976_2018)

# clean data: remove NA and missing outcomes
data = data[!is.na(data$incwage),]
data = data[data$incwage != 9999999,] # remove missing
data = data[data$incwage != 9999998,] # remove missing

# extract 2018 data
data = data[data$year == 2018,]

# add age^2, educ^2 covariates
data$agesq = data$age**2
data$educsq = data$educ**2

covariates = c("age", "agesq", "educ", "educsq", "occ2010", "occ10ly", "sex", "race",
"marst", "labforce", "ind1950", "classwkr", "wkstat", "uhrswork1", "metro", "famsize")

continuous.covariates = which(covariates %in% c("age", "educ", "agesq", "educsq", "uhrswork1", "famsize"))
outcome = "incwage"

data = data[,c(covariates, outcome)]
data = data[complete.cases(data),]

# transform outcome (standard for wage regressions)
data$incwage = log(data$incwage + 1)

######################
## ERROR EVALUATION ##
######################

num.reps = 100
size.test = 10000
sample.sizes = c(2000, 5000, 10000, 50000)

mse.sample.sizes = data.frame(t(sapply(sample.sizes, function(size){
index.train = sample(1:nrow(data), size = size, replace = FALSE)

X = data[index.train, covariates]
Y = data$incwage[index.train]

# keep only complete rows, subsetting X and Y with one index so they stay aligned
complete.train = complete.cases(X)
Y = Y[complete.train]
X = X[complete.train,]

results = data.frame(t(sapply(1:num.reps, function(i){
print(i)

index.test = sample((1:nrow(data))[-index.train], size = size.test, replace = FALSE)

X.test = data[index.test, covariates]
truth = data$incwage[index.test]

# likewise subset X.test and truth with one index so they stay aligned
complete.test = complete.cases(X.test)
truth = truth[complete.test]
X.test = X.test[complete.test,]

forest = regression_forest(as.matrix(X), Y, honesty = TRUE, tune.parameters = TRUE)

ll.lambda = tune_ll_regression_forest(forest, linear.correction.variables = continuous.covariates,
ll.weight.penalty = T)$lambda.min
llf.preds = predict(forest, as.matrix(X.test),
linear.correction.variables = continuous.covariates,
ll.lambda = ll.lambda,
ll.weight.penalty = T)$predictions
llf.mse = mean((llf.preds - truth)**2)

rf.preds = predict(forest, as.matrix(X.test))$predictions
rf.mse = mean((rf.preds - truth)**2)

ols.form = as.formula(paste("Y", paste(covariates, collapse = "+"), sep = "~"))
dd.ols = cbind(Y, X)
ols.fit = lm(ols.form, dd.ols)
ols.preds = predict(ols.fit, X.test)
ols.mse = mean((ols.preds - truth)**2)

mm = model.matrix( ~.^2, data = X)
lasso.mod = cv.glmnet(mm, Y, alpha = 1)
mmtest = model.matrix( ~.^2, data = X.test)
lasso.preds = predict(lasso.mod, newx = mmtest, s = "lambda.min")
lasso.mse = mean((lasso.preds - truth)**2)

bart.mod = wbart(X, Y, X.test)
bart.preds = bart.mod$yhat.test.mean
bart.mse = mean((bart.preds-truth)**2)

boost.cv.fit = cvboost(as.matrix(X), Y)
xgb.preds = predict(boost.cv.fit, as.matrix(X.test))
xg.mse = mean((xgb.preds - truth)**2)

return(c(ols.mse, llf.mse, rf.mse, lasso.mse, xg.mse, bart.mse))
})))

mses = colMeans(results)
sds = apply(results, MARGIN = 2, FUN = sd)

as.numeric(c(mses, sds))
})))

colnames(mse.sample.sizes) = c("OLS", "LLF", "RF", "Lasso", "XG", "BART",
"OLS.sd", "LLF.sd", "RF.sd", "Lasso.sd", "XG.sd", "BART.sd")

mse.sample.sizes$size = sample.sizes

write.csv(mse.sample.sizes,"wages_sample_sizes.csv", row.names = FALSE)