From 8e5a2a46ec246c8d3e3c76f5d89bfb6ca9093283 Mon Sep 17 00:00:00 2001
From: Rina Friedberg
Date: Wed, 21 Aug 2019 15:23:51 -0700
Subject: [PATCH] Update boosting and bart implementations in simulations (#487)

* updated boosting and bart implementations in simulations
* added wages simulation code
* edited wages code for clarity
* added wages call in main file
* updated wages code to reflect new train test split
* added explicit rlearner call for readability
---
 .../local_linear_examples/bias_image.R        |   4 +-
 .../boundary_bias_table.R                     |  38 +-----
 .../local_linear_examples/friedman_table.R    |  38 +-----
 experiments/local_linear_examples/main.R      |   1 +
 experiments/local_linear_examples/wages.R     | 116 ++++++++++++++++++
 5 files changed, 129 insertions(+), 68 deletions(-)
 create mode 100644 experiments/local_linear_examples/wages.R

diff --git a/experiments/local_linear_examples/bias_image.R b/experiments/local_linear_examples/bias_image.R
index 6dc6cc191..db61ac99e 100644
--- a/experiments/local_linear_examples/bias_image.R
+++ b/experiments/local_linear_examples/bias_image.R
@@ -17,9 +17,7 @@ ll.forest = local_linear_forest(X, Y)
 
 # lasso to select local linear correction variables
 lasso.mod = cv.glmnet(X, Y, alpha=1)
-selected = which(coef(lasso.mod) != 0)
-# remove intercept and adjust indexes correspondingly
-selected = selected[2:length(selected)] - 1
+selected = as.numeric(predict(lasso.mod, type = "nonzero"))
 preds.llf = predict(ll.forest, linear.correction.variables = selected, tune.lambda = TRUE)$predictions
 
 ticks = seq(-1, 1, length = 2000)
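Note on the change above: instead of taking which(coef(...) != 0) and manually dropping the intercept, the scripts now ask glmnet directly for the indices of the lasso-selected covariates. A minimal self-contained sketch of the pattern on simulated data (the variable names and toy data-generating process here are illustrative only, and this assumes predict() on a cv.glmnet fit with type = "nonzero" returns those column indices at the default lambda):

    library(grf)
    library(glmnet)

    n = 500; p = 10
    X = matrix(runif(n * p, -1, 1), nrow = n)
    Y = log(1 + exp(6 * X[, 1])) + rnorm(n)

    # lasso to select local linear correction variables
    lasso.mod = cv.glmnet(X, Y, alpha = 1)
    selected = as.numeric(predict(lasso.mod, type = "nonzero"))

    # local linear forest with lasso-selected correction variables
    ll.forest = local_linear_forest(X, Y)
    preds.llf = predict(ll.forest, linear.correction.variables = selected,
                        tune.lambda = TRUE)$predictions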
diff --git a/experiments/local_linear_examples/boundary_bias_table.R b/experiments/local_linear_examples/boundary_bias_table.R
index 9b681fa34..db3e940d7 100644
--- a/experiments/local_linear_examples/boundary_bias_table.R
+++ b/experiments/local_linear_examples/boundary_bias_table.R
@@ -2,6 +2,7 @@ library(grf)
 library(glmnet)
 library(BART)
 library(xgboost)
+library(rlearner)
 
 set.seed(1234)
 
@@ -9,27 +10,6 @@ mu = function(x){
   log(1+exp(6*x[1]))
 }
 
-boosting.cv = function(dtrain, Y){
-
-  # cross-validate on a reasonable grid
-  etas = seq(0, 0.3, by = 0.1)
-  nrounds = c(50, 100, 500, 1000)
-  max_depth = seq(2, 8, by = 2)
-
-  args.cv = expand.grid(e = etas, n = nrounds, d = max_depth)
-  results = t(apply(args.cv, MARGIN = 1, FUN = function(arguments){
-    model.xgb = xgboost(dtrain, nrounds = arguments[2],
-                        params = list(objective = "reg:linear"),
-                        eval_metric = "rmse",
-                        eta = arguments[1],
-                        max_depth = arguments[3])
-    xgb.preds = predict(model.xgb, newdata = dtrain)
-    mean((xgb.preds - Y)**2)
-  }))
-  best.index = which.min(results)
-  return(args.cv[best.index,])
-}
-
 simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000){
   errors = replicate(num.reps, {
     X = matrix(runif(n*p, -1, 1), nrow = n)
@@ -57,7 +37,7 @@ simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000){
 
     # use lasso to select linear correction variables
     lasso.mod = cv.glmnet(X, Y, alpha = 1)
-    lasso.coef = predict(lasso.mod, type = "nonzero")
+    lasso.coef = as.numeric(predict(lasso.mod, type = "nonzero"))
     if(!is.null(dim(lasso.coef))){
       selected = lasso.coef[,1]
     } else {
@@ -86,20 +66,12 @@ simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000){
     lasso.rf.preds = predict(rf, newdata = X.test)$predictions + predict(lasso.mod, newx = X.test, s = "lambda.min")
     lasso.rf.mse = mean((lasso.rf.preds-truth)**2)
 
-    bart.mod = wbart(X, Y, X.test, nskip = 5, ndpost = 5)
+    bart.mod = wbart(X, Y, X.test)
     bart.preds = bart.mod$yhat.test.mean
     bart.mse = mean((bart.preds-truth)**2)
 
-    dtrain = xgb.DMatrix(X, label = Y)
-    cv.variables = boosting.cv(dtrain, Y)
-    cv.variables = as.numeric(cv.variables)
-    eta = cv.variables[1]
-    nrounds = cv.variables[2]
-    max_depth = cv.variables[3]
-
-    model.xgb = xgboost(dtrain, nrounds = nrounds, params = list(objective = "reg:linear"),
-                        eval_metric = "rmse", eta = eta, max_depth = max_depth)
-    xgb.preds = predict(model.xgb, newdata = X.test)
+    boost.cv.fit = rlearner::cvboost(as.matrix(X), Y)
+    xgb.preds = predict(boost.cv.fit, as.matrix(X.test))
     xg.mse = mean((xgb.preds - truth)**2)
 
     c(llf.stepwise.mse, llf.oracle, rf.mse, rf.adapt.mse, lasso.rf.mse, bart.mse, xg.mse, llf.lasso.mse)

diff --git a/experiments/local_linear_examples/friedman_table.R b/experiments/local_linear_examples/friedman_table.R
index 5ba3f9913..5d62c69cd 100644
--- a/experiments/local_linear_examples/friedman_table.R
+++ b/experiments/local_linear_examples/friedman_table.R
@@ -2,6 +2,7 @@ library(grf)
 library(glmnet)
 library(BART)
 library(xgboost)
+library(rlearner)
 
 set.seed(1234)
 
@@ -9,27 +10,6 @@ ff = function(x){
   return(10*sin(pi*x[1]*x[2]) + 20*((x[3] - 0.5)**2) + 10*x[4] + 5*x[5])
 }
 
-boosting.cv = function(dtrain, Y){
-
-  # cross-validate on a reasonable grid
-  etas = seq(0, 0.3, by = 0.1)
-  nrounds = c(50, 100, 500, 1000)
-  max_depth = seq(2, 8, by = 2)
-
-  args.cv = expand.grid(e = etas, n = nrounds, d = max_depth)
-  results = t(apply(args.cv, MARGIN = 1, FUN = function(arguments){
-    model.xgb = xgboost(dtrain, nrounds = arguments[2],
-                        params = list(objective = "reg:linear"),
-                        eval_metric = "rmse",
-                        eta = arguments[1],
-                        max_depth = arguments[3])
-    xgb.preds = predict(model.xgb, newdata = dtrain)
-    mean((xgb.preds - Y)**2)
-  }))
-  best.index = which.min(results)
-  return(args.cv[best.index,])
-}
-
 simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000, num.trees = 2000){
   errors = replicate(num.reps, {
     X = matrix(runif(n*p,0,1), nrow = n)
@@ -58,7 +38,8 @@ simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000, num.trees =
 
     # use lasso to select linear correction variables
     lasso.mod = cv.glmnet(X, Y, alpha = 1)
-    lasso.coef = predict(lasso.mod, type = "nonzero")
+    lasso.coef = as.numeric(predict(lasso.mod, type = "nonzero"))
+    # if lasso chose no variables, use all for LL correction
     if(!is.null(dim(lasso.coef))){
       selected = lasso.coef[,1]
     } else {
@@ -87,19 +68,12 @@ simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000, num.trees =
     lasso.rf.preds = predict(rf, newdata = X.test)$predictions + predict(lasso.mod, newx = X.test, s = "lambda.min")
     lasso.rf.mse = mean((lasso.rf.preds-truth)**2)
 
-    bart.mod = wbart(X, Y, X.test, nskip = 5, ndpost = 5)
+    bart.mod = wbart(X, Y, X.test)
     bart.preds = bart.mod$yhat.test.mean
     bart.mse = mean((bart.preds-truth)**2)
 
-    dtrain = xgb.DMatrix(X, label = Y)
-    cv.variables = as.numeric(boosting.cv(dtrain, Y))
-    eta = cv.variables[1]
-    nrounds = cv.variables[2]
-    max_depth = cv.variables[3]
-
-    model.xgb = xgboost(dtrain, nrounds = nrounds, params = list(objective = "reg:linear"),
-                        eval_metric = "rmse", eta = eta, max_depth = max_depth)
-    xgb.preds = predict(model.xgb, newdata = X.test)
+    boost.cv.fit = rlearner::cvboost(as.matrix(X), Y)
+    xgb.preds = predict(boost.cv.fit, as.matrix(X.test))
     xg.mse = mean((xgb.preds - truth)**2)
 
     c(llf.stepwise.mse, llf.oracle, rf.mse, rf.adapt.mse, lasso.rf.mse, bart.mse, xg.mse, llf.lasso.mse)
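Both table scripts above replace the hand-rolled boosting.cv grid search with rlearner's cross-validated boosting wrapper, which is assumed to tune the xgboost hyperparameters internally. A minimal sketch of the swapped-in call, reusing the X, Y, X.test, and truth objects already defined inside simulation.run:

    library(rlearner)

    # cross-validated gradient boosting; the hyperparameter search happens inside cvboost
    boost.cv.fit = rlearner::cvboost(as.matrix(X), Y)
    xgb.preds = predict(boost.cv.fit, as.matrix(X.test))
    xg.mse = mean((xgb.preds - truth)**2)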
diff --git a/experiments/local_linear_examples/main.R b/experiments/local_linear_examples/main.R
index 5ec784c0f..63354f524 100644
--- a/experiments/local_linear_examples/main.R
+++ b/experiments/local_linear_examples/main.R
@@ -10,3 +10,4 @@ source("friedman_table.R")
 source("confidence.R")
 source("causal_table.R")
 source("boundary_bias_table.R")
+source("wages.R")
\ No newline at end of file
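The wages experiment added below trains on a random subsample and evaluates on rows drawn only from the remaining data, per the "new train test split" noted in the commit message. The core sampling pattern, extracted as a sketch (data, size, and size.test are defined in the script that follows):

    index.train = sample(1:nrow(data), size = size, replace = FALSE)
    # test indices come from the complement of the training indices
    index.test = sample((1:nrow(data))[-index.train], size = size.test, replace = FALSE)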
diff --git a/experiments/local_linear_examples/wages.R b/experiments/local_linear_examples/wages.R
new file mode 100644
index 000000000..a15e4ed3f
--- /dev/null
+++ b/experiments/local_linear_examples/wages.R
@@ -0,0 +1,116 @@
+rm(list = ls())
+set.seed(123)
+
+library(grf)
+library(glmnet)
+library(BART)
+library(xgboost)
+library(ggplot2)
+library(dplyr)
+library(splines)
+library(rlearner)
+
+# load the data
+load("cps1976_2018.RData")
+data = data.frame(cps1976_2018)
+
+# clean data: remove NA and missing outcomes
+data = data[!is.na(data$incwage),]
+data = data[data$incwage != 9999999,] # remove missing
+data = data[data$incwage != 9999998,] # remove missing
+
+# extract 2018 data
+data = data[data$year == 2018,]
+
+# add age^2, educ^2 covariates
+data$agesq = data$age**2
+data$educsq = data$educ**2
+
+covariates = c("age", "agesq", "educ", "educsq", "occ2010", "occ10ly", "sex", "race",
+               "marst", "labforce", "ind1950", "classwkr", "wkstat", "uhrswork1", "metro", "famsize")
+
+continuous.covariates = which(covariates %in% c("age", "educ", "agesq", "educsq", "uhrswork1", "famsize"))
+outcome = "incwage"
+
+data = data[,c(covariates, outcome)]
+data = data[complete.cases(data),]
+
+# transform outcome (standard for wage regressions)
+data$incwage = log(data$incwage + 1)
+
+######################
+## ERROR EVALUATION ##
+######################
+
+num.reps = 100
+size.test = 10000
+sample.sizes = c(2000, 5000, 10000, 50000)
+
+mse.sample.sizes = data.frame(t(sapply(sample.sizes, function(size){
+  index.train = sample(1:nrow(data), size = size, replace = FALSE)
+
+  X = data[index.train, covariates]
+  Y = data$incwage[index.train]
+
+  Y = Y[complete.cases(X)]
+  X = X[complete.cases(X),]
+
+  results = data.frame(t(sapply(1:num.reps, function(i){
+    print(i)
+
+    index.test = sample((1:nrow(data))[-index.train], size = size.test, replace = FALSE)
+
+    X.test = data[index.test, covariates]
+    truth = data$incwage[index.test]
+
+    truth = truth[complete.cases(X.test)]
+    X.test = X.test[complete.cases(X.test),]
+
+    forest = regression_forest(as.matrix(X), Y, honesty = TRUE, tune.parameters = TRUE)
+
+    ll.lambda = tune_ll_regression_forest(forest, linear.correction.variables = continuous.covariates,
+                                          ll.weight.penalty = T)$lambda.min
+    llf.preds = predict(forest, as.matrix(X.test),
+                        linear.correction.variables = continuous.covariates,
+                        ll.lambda = ll.lambda,
+                        ll.weight.penalty = T)$predictions
+    llf.mse = mean((llf.preds - truth)**2)
+
+    rf.preds = predict(forest, as.matrix(X.test))$predictions
+    rf.mse = mean((rf.preds - truth)**2)
+
+    ols.form = as.formula(paste("Y", paste(covariates, collapse = "+"), sep = "~"))
+    dd.ols = cbind(Y, X)
+    ols.fit = lm(ols.form, dd.ols)
+    ols.preds = predict(ols.fit, X.test)
+    ols.mse = mean((ols.preds - truth)**2)
+
+    mm = model.matrix( ~.^2, data = X)
+    lasso.mod = cv.glmnet(mm, Y, alpha = 1)
+    mmtest = model.matrix( ~.^2, data = X.test)
+    lasso.preds = predict(lasso.mod, newx = mmtest, s = "lambda.min")
+    lasso.mse = mean((lasso.preds - truth)**2)
+
+    bart.mod = wbart(X, Y, X.test)
+    bart.preds = bart.mod$yhat.test.mean
+    bart.mse = mean((bart.preds-truth)**2)
+
+    boost.cv.fit = rlearner::cvboost(as.matrix(X), Y)
+    xgb.preds = predict(boost.cv.fit, as.matrix(X.test))
+    xg.mse = mean((xgb.preds - truth)**2)
+
+    return(c(ols.mse, llf.mse, rf.mse, lasso.mse, xg.mse, bart.mse))
+  })))
+
+  mses = colMeans(results)
+  sds = apply(results, MARGIN = 2, FUN = sd)
+
+  as.numeric(c(mses, sds))
+})))
+
+colnames(mse.sample.sizes) = c("OLS", "LLF", "RF", "Lasso", "XG", "BART",
+                               "OLS.sd", "LLF.sd", "RF.sd", "Lasso.sd", "XG.sd", "BART.sd")
+
+mse.sample.sizes$size = sample.sizes
+
+write.csv(mse.sample.sizes,"wages_sample_sizes.csv", row.names = FALSE)
\ No newline at end of file
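wages.R ends by writing one row per training-set size to wages_sample_sizes.csv, holding the mean MSE and standard deviation for each method. A possible follow-up step, not part of this patch, for inspecting those results with the ggplot2 package the script already loads (column names match the colnames() call above):

    library(ggplot2)

    results = read.csv("wages_sample_sizes.csv")
    # compare a few methods as the training sample grows
    ggplot(results, aes(x = size)) +
      geom_line(aes(y = LLF, color = "LLF")) +
      geom_line(aes(y = RF, color = "RF")) +
      geom_line(aes(y = BART, color = "BART")) +
      labs(x = "training sample size", y = "mean squared error", color = "method")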