From 34857b79b6d6a3a157ebd30674e03ee90e454663 Mon Sep 17 00:00:00 2001
From: Rina Siller Friedberg
Date: Mon, 19 Aug 2019 18:12:34 -0700
Subject: [PATCH 1/6] updated boosting and bart implementations in simulations

---
 .../local_linear_examples/bias_image.R     |  4 +-
 .../boundary_bias_table.R                  | 38 +++----------------
 .../local_linear_examples/friedman_table.R | 38 +++----------------
 3 files changed, 12 insertions(+), 68 deletions(-)

diff --git a/experiments/local_linear_examples/bias_image.R b/experiments/local_linear_examples/bias_image.R
index 6dc6cc191..db61ac99e 100644
--- a/experiments/local_linear_examples/bias_image.R
+++ b/experiments/local_linear_examples/bias_image.R
@@ -17,9 +17,7 @@ ll.forest = local_linear_forest(X, Y)
 
 # lasso to select local linear correction variables
 lasso.mod = cv.glmnet(X, Y, alpha=1)
-selected = which(coef(lasso.mod) != 0)
-# remove intercept and adjust indexes correspondingly
-selected = selected[2:length(selected)] - 1
+selected = as.numeric(predict(lasso.mod, type = "nonzero"))
 preds.llf = predict(ll.forest, linear.correction.variables = selected, tune.lambda = TRUE)$predictions
 
 ticks = seq(-1, 1, length = 2000)
diff --git a/experiments/local_linear_examples/boundary_bias_table.R b/experiments/local_linear_examples/boundary_bias_table.R
index 9b681fa34..38e364aa8 100644
--- a/experiments/local_linear_examples/boundary_bias_table.R
+++ b/experiments/local_linear_examples/boundary_bias_table.R
@@ -2,6 +2,7 @@ library(grf)
 library(glmnet)
 library(BART)
 library(xgboost)
+library(rlearner)
 
 set.seed(1234)
 
@@ -9,27 +10,6 @@ mu = function(x){
   log(1+exp(6*x[1]))
 }
 
-boosting.cv = function(dtrain, Y){
-
-  # cross-validate on a reasonable grid
-  etas = seq(0, 0.3, by = 0.1)
-  nrounds = c(50, 100, 500, 1000)
-  max_depth = seq(2, 8, by = 2)
-
-  args.cv = expand.grid(e = etas, n = nrounds, d = max_depth)
-  results = t(apply(args.cv, MARGIN = 1, FUN = function(arguments){
-    model.xgb = xgboost(dtrain, nrounds = arguments[2],
-                        params = list(objective = "reg:linear"),
-                        eval_metric = "rmse",
-                        eta = arguments[1],
-                        max_depth = arguments[3])
-    xgb.preds = predict(model.xgb, newdata = dtrain)
-    mean((xgb.preds - Y)**2)
-  }))
-  best.index = which.min(results)
-  return(args.cv[best.index,])
-}
-
 simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000){
   errors = replicate(num.reps, {
     X = matrix(runif(n*p, -1, 1), nrow = n)
@@ -57,7 +37,7 @@ simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000){
 
     # use lasso to select linear correction variables
     lasso.mod = cv.glmnet(X, Y, alpha = 1)
-    lasso.coef = predict(lasso.mod, type = "nonzero")
+    lasso.coef = as.numeric(predict(lasso.mod, type = "nonzero"))
     if(!is.null(dim(lasso.coef))){
       selected = lasso.coef[,1]
     } else {
@@ -86,20 +66,12 @@ simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000){
     lasso.rf.preds = predict(rf, newdata = X.test)$predictions + predict(lasso.mod, newx = X.test, s = "lambda.min")
     lasso.rf.mse = mean((lasso.rf.preds-truth)**2)
 
-    bart.mod = wbart(X, Y, X.test, nskip = 5, ndpost = 5)
+    bart.mod = wbart(X, Y, X.test)
     bart.preds = bart.mod$yhat.test.mean
     bart.mse = mean((bart.preds-truth)**2)
 
-    dtrain = xgb.DMatrix(X, label = Y)
-    cv.variables = boosting.cv(dtrain, Y)
-    cv.variables = as.numeric(cv.variables)
-    eta = cv.variables[1]
-    nrounds = cv.variables[2]
-    max_depth = cv.variables[3]
-
-    model.xgb = xgboost(dtrain, nrounds = nrounds, params = list(objective = "reg:linear"),
-                        eval_metric = "rmse", eta = eta, max_depth = max_depth)
-    xgb.preds = predict(model.xgb, newdata = X.test)
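
[Note on patch 1] The removed boosting.cv() tuned xgboost over a grid but scored every
candidate on the training data itself (it predicts on dtrain and compares to Y), which
systematically favors overfit settings; rlearner::cvboost tunes by held-out error instead.
Dropping nskip/ndpost from wbart likewise restores the package defaults (per BART's
documentation, 1000 posterior draws after 100 burn-in) in place of 5 and 5. A minimal
sketch of the cross-validated tuning idea, with a hypothetical grid and round count,
not rlearner's actual internals:

library(xgboost)
tune.boost = function(X, Y, nfold = 5) {
  grid = expand.grid(eta = c(0.05, 0.1, 0.3), max_depth = c(2, 4, 6))
  cv.err = apply(grid, 1, function(g) {
    cv = xgb.cv(params = list(objective = "reg:squarederror",  # newer spelling of reg:linear
                              eta = g[["eta"]], max_depth = g[["max_depth"]]),
                data = xgb.DMatrix(X, label = Y),
                nrounds = 200, nfold = nfold, verbose = 0)
    # held-out rmse, not training rmse, drives the selection
    min(cv$evaluation_log$test_rmse_mean)
  })
  best = grid[which.min(cv.err), ]
  xgboost(data = xgb.DMatrix(X, label = Y), nrounds = 200, verbose = 0,
          params = list(objective = "reg:squarederror",
                        eta = best$eta, max_depth = best$max_depth))
}
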
+    boost.cv.fit = cvboost(as.matrix(X), Y)
+    xgb.preds = predict(boost.cv.fit, as.matrix(X.test))
     xg.mse = mean((xgb.preds - truth)**2)
 
     c(llf.stepwise.mse, llf.oracle, rf.mse, rf.adapt.mse, lasso.rf.mse, bart.mse, xg.mse, llf.lasso.mse)
diff --git a/experiments/local_linear_examples/friedman_table.R b/experiments/local_linear_examples/friedman_table.R
index 5ba3f9913..6d9652655 100644
--- a/experiments/local_linear_examples/friedman_table.R
+++ b/experiments/local_linear_examples/friedman_table.R
@@ -2,6 +2,7 @@ library(grf)
 library(glmnet)
 library(BART)
 library(xgboost)
+library(rlearner)
 
 set.seed(1234)
 
@@ -9,27 +10,6 @@ ff = function(x){
   return(10*sin(pi*x[1]*x[2]) + 20*((x[3] - 0.5)**2) + 10*x[4] + 5*x[5])
 }
 
-boosting.cv = function(dtrain, Y){
-
-  # cross-validate on a reasonable grid
-  etas = seq(0, 0.3, by = 0.1)
-  nrounds = c(50, 100, 500, 1000)
-  max_depth = seq(2, 8, by = 2)
-
-  args.cv = expand.grid(e = etas, n = nrounds, d = max_depth)
-  results = t(apply(args.cv, MARGIN = 1, FUN = function(arguments){
-    model.xgb = xgboost(dtrain, nrounds = arguments[2],
-                        params = list(objective = "reg:linear"),
-                        eval_metric = "rmse",
-                        eta = arguments[1],
-                        max_depth = arguments[3])
-    xgb.preds = predict(model.xgb, newdata = dtrain)
-    mean((xgb.preds - Y)**2)
-  }))
-  best.index = which.min(results)
-  return(args.cv[best.index,])
-}
-
 simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000, num.trees = 2000){
   errors = replicate(num.reps, {
     X = matrix(runif(n*p,0,1), nrow = n)
@@ -58,7 +38,8 @@ simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000, num.trees =
 
     # use lasso to select linear correction variables
     lasso.mod = cv.glmnet(X, Y, alpha = 1)
-    lasso.coef = predict(lasso.mod, type = "nonzero")
+    lasso.coef = as.numeric(predict(lasso.mod, type = "nonzero"))
+    # if lasso chose no variables, use all for LL correction
     if(!is.null(dim(lasso.coef))){
       selected = lasso.coef[,1]
     } else {
@@ -87,19 +68,12 @@ simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000, num.trees =
     lasso.rf.preds = predict(rf, newdata = X.test)$predictions + predict(lasso.mod, newx = X.test, s = "lambda.min")
     lasso.rf.mse = mean((lasso.rf.preds-truth)**2)
 
-    bart.mod = wbart(X, Y, X.test, nskip = 5, ndpost = 5)
+    bart.mod = wbart(X, Y, X.test)
     bart.preds = bart.mod$yhat.test.mean
     bart.mse = mean((bart.preds-truth)**2)
 
-    dtrain = xgb.DMatrix(X, label = Y)
-    cv.variables = as.numeric(boosting.cv(dtrain, Y))
-    eta = cv.variables[1]
-    nrounds = cv.variables[2]
-    max_depth = cv.variables[3]
-
-    model.xgb = xgboost(dtrain, nrounds = nrounds, params = list(objective = "reg:linear"),
-                        eval_metric = "rmse", eta = eta, max_depth = max_depth)
-    xgb.preds = predict(model.xgb, newdata = X.test)
+    boost.cv.fit = cvboost(as.matrix(X), Y)
+    xgb.preds = predict(boost.cv.fit, as.matrix(X.test))
     xg.mse = mean((xgb.preds - truth)**2)
 
     c(llf.stepwise.mse, llf.oracle, rf.mse, rf.adapt.mse, lasso.rf.mse, bart.mse, xg.mse, llf.lasso.mse)

From 8ef66ed3b5d6cd8d6e265e87c2c7544b9dbd1669 Mon Sep 17 00:00:00 2001
From: Rina Siller Friedberg
Date: Mon, 19 Aug 2019 18:20:34 -0700
Subject: [PATCH 2/6] added wages simulation code

---
 experiments/local_linear_examples/wages.R | 228 ++++++++++++++++++++++
 1 file changed, 228 insertions(+)
 create mode 100644 experiments/local_linear_examples/wages.R

diff --git a/experiments/local_linear_examples/wages.R b/experiments/local_linear_examples/wages.R
new file mode 100644
index 000000000..8aa1030ee
--- /dev/null
+++ b/experiments/local_linear_examples/wages.R
@@ -0,0 +1,228 @@
+rm(list = ls())
+set.seed(123)
+
+library(grf)
+library(glmnet)
+library(BART)
+library(xgboost)
+library(ggplot2)
+library(dplyr)
+library(splines)
+
+# load the data
+load("cps1976_2018.RData")
+data = data.frame(cps1976_2018)
+
+# clean data: remove NA and missing outcomes
+data = data[!is.na(data$incwage),]
+data = data[data$incwage != 9999999,] # remove missing
+data = data[data$incwage != 9999998,] # remove missing
+
+# extract 2018 data
+data = data[data$year == 2018,]
+
+# add age^2, educ^2 covariates
+data$agesq = data$age**2
+data$educsq = data$educ**2
+
+covariates = c("age", "agesq", "educ", "educsq", "occ2010", "occ10ly", "sex", "race",
+               "marst", "labforce", "ind1950", "classwkr", "wkstat", "uhrswork1", "metro", "famsize")
+
+continuous.covariates = which(covariates %in% c("age", "educ", "agesq", "educsq", "uhrswork1", "famsize"))
+outcome = "incwage"
+
+data = data[,c(covariates, outcome)]
+data = data[complete.cases(data),]
+
+# transform outcome (standard for wage regressions)
+data$incwage = log(data$incwage + 1)
+
+######################
+## ERROR EVALUATION ##
+######################
+
+num.reps = 100
+size.test = 10000
+sample.sizes = c(2000, 5000, 10000, 50000)
+
+mse.sample.sizes = data.frame(t(sapply(sample.sizes, function(size){
+
+  results = data.frame(t(sapply(1:num.reps, function(i){
+    index.train = sample(1:nrow(data), size = size, replace = FALSE)
+    index.test = sample(1:nrow(data), size = size.test, replace = FALSE)
+
+    X = data[index.train, covariates]
+    Y = data$incwage[index.train]
+
+    X = X[complete.cases(X),]
+    Y = Y[complete.cases(X)]
+
+    forest = regression_forest(as.matrix(X), Y, honesty = TRUE, tune.parameters = TRUE)
+
+    X.test = data[index.test, covariates]
+    truth = data$incwage[index.test]
+
+    X.test = X.test[complete.cases(X.test),]
+    truth = truth[complete.cases(X.test)]
+
+    ll.lambda = tune_ll_regression_forest(forest, linear.correction.variables = continuous.covariates,
+                                          ll.weight.penalty = TRUE)$lambda.min
+    llf.lasso.preds = predict(forest, as.matrix(X.test),
+                              linear.correction.variables = continuous.covariates,
+                              ll.lambda = ll.lambda,
+                              ll.weight.penalty = TRUE)$predictions
+    llf.mse = mean((llf.lasso.preds - truth)**2)
+
+    rf.preds = predict(forest, as.matrix(X.test))$predictions
+    rf.mse = mean((rf.preds - truth)**2)
+
+    ols.form = as.formula(paste("Y", paste(covariates, collapse = "+"), sep = "~"))
+    dd.ols = cbind(Y, X)
+    ols.fit = lm(ols.form, dd.ols)
+    ols.preds = predict(ols.fit, X.test)
+    ols.mse = mean((ols.preds - truth)**2)
+
+    mm = model.matrix( ~.^2, data = X)
+    lasso.mod = cv.glmnet(mm, Y, alpha = 1)
+    mmtest = model.matrix( ~.^2, data = X.test)
+    lasso.preds = predict(lasso.mod, newx = mmtest, s = lasso.mod$lambda.min)
+    lasso.mse = mean((lasso.preds - truth)**2)
+
+    bart.mod = wbart(X, Y, X.test)
+    bart.preds = bart.mod$yhat.test.mean
+    bart.mse = mean((bart.preds-truth)**2)
+
+    boost.cv.fit = cvboost(as.matrix(X), Y)
+    xgb.preds = predict(boost.cv.fit, as.matrix(X.test))
+    xg.mse = mean((xgb.preds - truth)**2)
+
+    return(c(ols.mse, llf.mse, rf.mse, lasso.mse, xg.mse, bart.mse))
+  })))
+
+  mses = colMeans(results)
+  sds = apply(results, MARGIN = 2, FUN = sd)
+
+  as.numeric(c(mses, sds))
+})))
+
+colnames(mse.sample.sizes) = c("OLS", "Lasso", "RF", "LLF", "XG", "BART",
+                               "OLS.sd", "Lasso.sd", "RF.sd", "LLF.sd", "XG.sd", "BART.sd")
+
+mse.sample.sizes$size = sample.sizes
+
+write.csv(mse.sample.sizes,"wages_sample_sizes.csv", row.names = FALSE)
+
+############################
+## CALIBRATION AND IMAGES ##
+############################
+
+train.index = sample(nrow(data), size = floor(nrow(data)/2))
+train = data[train.index,]
+test = data[-train.index,]
+
+subsets = c("age", "race", "educ", "famsize")
+range.dummies = c("age.r", "race.r", "educ.r", "famsize.r")
+test$age.r = test$age < 20 | test$age > 85
+test$race.r = test$race > 200
+test$educ.r = test$educ > 111
+test$famsize.r = test$famsize > 6
+
+methods = c("lasso", "llf", "ols", "rf")
+
+forest.fit = regression_forest(train[,covariates], train$incwage, tune.parameters = T)
+ols.mod = lm(incwage ~ ., data = train[,c(covariates, "incwage")])
+mm = model.matrix( ~.^2, data = train[,covariates])
+lasso.mod = cv.glmnet(mm, train$incwage, alpha = 1)
+
+# generate plots for sparse regions and find corresponding t-statistics
+t_statistics = data.frame(t(sapply(1:length(subsets), function(i){
+
+  test.subset = subsets[i]
+  range.subset = range.dummies[i]
+  testdata = test[test[,range.subset] == T,]
+
+  names = sapply(methods, function(method){
+    paste(paste(method, test.subset, sep = "_"), "jpeg", sep = ".")
+  })
+
+  rf.preds = predict(forest.fit, testdata[,covariates])$predictions
+
+  tl = tune_ll_regression_forest(forest.fit, linear.correction.variables = continuous.covariates,
+                                 ll.weight.penalty = TRUE)$lambda.min
+  llf.preds = predict(forest.fit, testdata[,covariates], linear.correction.variables = continuous.covariates,
+                      ll.lambda = tl, ll.weight.penalty = TRUE)$predictions
+
+  ols.preds = predict(ols.mod, newdata = testdata)
+
+  mmtest = model.matrix( ~.^2, data=testdata[,covariates])
+  lasso.preds = predict(lasso.mod, newx = mmtest, lambda= lasso.mod$lambda.min)
+
+  dt = data.frame("Wages" = testdata$incwage,
+                  "LLF" = llf.preds,
+                  "RF" = rf.preds,
+                  "OLS" = ols.preds,
+                  "Lasso" = lasso.preds)
+  colnames(dt)[5] = "Lasso"
+
+  spline.fit = lm(dt$Wages ~ ns(dt$Lasso, df = 3))
+  dt$spline = spline.fit$fitted.values
+  ggplot(dt, aes(x = Lasso, y = Wages)) + geom_point(cex = 1) +
+    ylab("Observed log wages") + xlab("Predicted log wages") +
+    geom_line(aes(x = Lasso, y = spline), color = "red", lty = 1, lwd = 1.5) +
+    geom_abline(color = "Darkgray", intercept = 0, slope = 1, lty = 2, lwd = 1.5) +
+    theme_bw() +
+    theme(axis.text=element_text(size=16),axis.title=element_text(size=16)) +
+    xlim(c(range(dt$Lasso, dt$Wages))) + ylim(c(range(dt$Lasso, dt$Wages)))
+  ggsave(names[1])
+
+  spline.fit = lm(dt$Wages ~ ns(dt$LLF, df = 3))
+  dt$spline = spline.fit$fitted.values
+  ggplot(dt, aes(x = LLF, y = Wages)) + geom_point(cex = 1) +
+    ylab("Observed log wages") + xlab("Predicted log wages") +
+    geom_line(aes(x = LLF, y = spline), color = "red", lty = 1, lwd = 1.5) +
+    geom_abline(color = "Darkgray", intercept = 0, slope = 1, lty = 2, lwd = 1.5) +
+    theme_bw() +
+    theme(axis.text=element_text(size=16),axis.title=element_text(size=16)) +
+    xlim(c(range(dt$LLF, dt$Wages))) + ylim(c(range(dt$LLF, dt$Wages)))
+  ggsave(names[2])
+
+  spline.fit = lm(dt$Wages ~ ns(dt$RF, df = 3))
+  dt$spline = spline.fit$fitted.values
+  ggplot(dt, aes(x = RF, y = Wages)) + geom_point(cex = 1) +
+    ylab("Observed log wages") + xlab("Predicted log wages") +
+    geom_line(aes(x = RF, y = spline), color = "red", lty = 1, lwd = 1.5) +
+    geom_abline(color = "Darkgray", intercept = 0, slope = 1, lty = 2, lwd = 1.5) +
+    theme_bw() +
+    theme(axis.text=element_text(size=16),axis.title=element_text(size=16)) +
+    xlim(c(range(dt$RF, dt$Wages))) + ylim(c(range(dt$RF, dt$Wages)))
+  ggsave(names[3])
+
+  spline.fit = lm(dt$Wages ~ ns(dt$OLS, df = 3))
+  dt$spline = spline.fit$fitted.values
+  ggplot(dt, aes(x = OLS, y = Wages)) + geom_point(cex = 1) +
+    ylab("Observed log wages") + xlab("Predicted log wages") +
+    geom_line(aes(x = OLS, y = spline), color = "red", lty = 1, lwd = 1.5) +
+    geom_abline(color = "Darkgray", intercept = 0, slope = 1, lty = 2, lwd = 1.5) +
+    theme_bw() +
+    theme(axis.text=element_text(size=16),axis.title=element_text(size=16)) +
+    xlim(c(range(dt$OLS, dt$Wages))) + ylim(c(range(dt$OLS, dt$Wages)))
+  ggsave(names[4])
+
+  # squared differences:
+  ols.sds = (ols.preds - testdata$incwage)**2
+  llf.sds = (llf.preds- testdata$incwage)**2
+  lasso.sds = (lasso.preds - testdata$incwage)**2
+  rf.sds = (rf.preds - testdata$incwage)**2
+
+  # corresponding paired t-statistics:
+  t.ols = mean(ols.sds - llf.sds)/(sd(ols.sds - llf.sds)/sqrt(nrow(testdata)))
+  t.lasso = mean(lasso.sds - llf.sds)/(sd(lasso.sds - llf.sds)/sqrt(nrow(testdata)))
+  t.rf = mean(rf.sds - llf.sds)/(sd(rf.sds - llf.sds)/sqrt(nrow(testdata)))
+
+  c(t.ols, t.lasso, t.rf)
+})))
+
+colnames(t_statistics) = c("OLS", "Lasso", "RF")
+rownames(t_statistics) = subsets
+
+write.csv(t_statistics,"t_statistics.csv", row.names = TRUE)
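
[Note on patch 2] The paired t-statistics at the end of wages.R are hand-rolled one-sample
t statistics on per-observation differences in squared error. A quick sanity check with
illustrative stand-in data that the formula used there matches R's built-in t.test:

d = rnorm(100, mean = 0.1)  # stand-in for, e.g., ols.sds - llf.sds
t.manual = mean(d) / (sd(d) / sqrt(length(d)))
t.builtin = unname(t.test(d)$statistic)
stopifnot(isTRUE(all.equal(t.manual, t.builtin)))
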
From a454f5c6819a10d2dc07c12eaa7f2ad32428f872 Mon Sep 17 00:00:00 2001
From: Rina Siller Friedberg
Date: Mon, 19 Aug 2019 18:24:00 -0700
Subject: [PATCH 3/6] edited wages code for clarity

---
 experiments/local_linear_examples/wages.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/experiments/local_linear_examples/wages.R b/experiments/local_linear_examples/wages.R
index 8aa1030ee..8b3d8c286 100644
--- a/experiments/local_linear_examples/wages.R
+++ b/experiments/local_linear_examples/wages.R
@@ -8,6 +8,7 @@ library(xgboost)
 library(ggplot2)
 library(dplyr)
 library(splines)
+library(rlearner)
 
 # load the data
 load("cps1976_2018.RData")
 data = data.frame(cps1976_2018)
@@ -49,7 +50,7 @@ mse.sample.sizes = data.frame(t(sapply(sample.sizes, function(size){
 
   results = data.frame(t(sapply(1:num.reps, function(i){
     index.train = sample(1:nrow(data), size = size, replace = FALSE)
-    index.test = sample(1:nrow(data), size = size.test, replace = FALSE)
+    index.test = sample((1:nrow(data))[-index.train], size = size.test, replace = FALSE)
 
     X = data[index.train, covariates]
     Y = data$incwage[index.train]
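
[Note on patch 3] Why the index.test change matters: sampling test rows from all of
1:nrow(data) lets train and test overlap, which biases the error estimates downward.
An illustration with toy indices:

n = 1000
tr = sample(1:n, 500)
te.old = sample(1:n, 500)          # old draw: can overlap with tr
te.new = sample((1:n)[-tr], 500)   # patched draw: disjoint by construction
length(intersect(tr, te.old))      # typically around 250 shared rows
length(intersect(tr, te.new))      # always 0
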
From 8bced9724a857ed1ed3dd1de33d56b9dee1ee182 Mon Sep 17 00:00:00 2001
From: Rina Siller Friedberg
Date: Mon, 19 Aug 2019 18:24:53 -0700
Subject: [PATCH 4/6] added wages call in main file

---
 experiments/local_linear_examples/main.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/experiments/local_linear_examples/main.R b/experiments/local_linear_examples/main.R
index 5ec784c0f..63354f524 100644
--- a/experiments/local_linear_examples/main.R
+++ b/experiments/local_linear_examples/main.R
@@ -10,3 +10,4 @@ source("friedman_table.R")
 source("confidence.R")
 source("causal_table.R")
 source("boundary_bias_table.R")
+source("wages.R")
\ No newline at end of file

From 206e68be3c8995121947368985aa77a3fe17453f Mon Sep 17 00:00:00 2001
From: Rina Siller Friedberg
Date: Wed, 21 Aug 2019 11:35:52 -0700
Subject: [PATCH 5/6] updated wages code to reflect new train test split

---
 experiments/local_linear_examples/wages.R | 151 +++------------------
 1 file changed, 19 insertions(+), 132 deletions(-)

diff --git a/experiments/local_linear_examples/wages.R b/experiments/local_linear_examples/wages.R
index 8b3d8c286..7a68c0909 100644
--- a/experiments/local_linear_examples/wages.R
+++ b/experiments/local_linear_examples/wages.R
@@ -47,18 +47,18 @@ size.test = 10000
 sample.sizes = c(2000, 5000, 10000, 50000)
 
 mse.sample.sizes = data.frame(t(sapply(sample.sizes, function(size){
-
+  index.train = sample(1:nrow(data), size = size, replace = FALSE)
+
+  X = data[index.train, covariates]
+  Y = data$incwage[index.train]
+
+  X = X[complete.cases(X),]
+  Y = Y[complete.cases(X)]
+
   results = data.frame(t(sapply(1:num.reps, function(i){
+    print(i)
-    index.train = sample(1:nrow(data), size = size, replace = FALSE)
-    index.test = sample((1:nrow(data))[-index.train], size = size.test, replace = FALSE)
 
-    X = data[index.train, covariates]
-    Y = data$incwage[index.train]
-
-    X = X[complete.cases(X),]
-    Y = Y[complete.cases(X)]
-
-    forest = regression_forest(as.matrix(X), Y, honesty = TRUE, tune.parameters = TRUE)
+    index.test = sample((1:nrow(data))[-index.train], size = size.test, replace = FALSE)
 
     X.test = data[index.test, covariates]
     truth = data$incwage[index.test]
@@ -66,13 +66,15 @@ mse.sample.sizes = data.frame(t(sapply(sample.sizes, function(size){
     X.test = X.test[complete.cases(X.test),]
     truth = truth[complete.cases(X.test)]
 
+    forest = regression_forest(as.matrix(X), Y, honesty = TRUE, tune.parameters = TRUE)
+
     ll.lambda = tune_ll_regression_forest(forest, linear.correction.variables = continuous.covariates,
-                                          ll.weight.penalty = TRUE)$lambda.min
-    llf.lasso.preds = predict(forest, as.matrix(X.test),
+                                          ll.weight.penalty = T)$lambda.min
+    llf.preds = predict(forest, as.matrix(X.test),
                               linear.correction.variables = continuous.covariates,
                               ll.lambda = ll.lambda,
-                              ll.weight.penalty = TRUE)$predictions
-    llf.mse = mean((llf.lasso.preds - truth)**2)
+                              ll.weight.penalty = T)$predictions
+    llf.mse = mean((llf.preds - truth)**2)
 
     rf.preds = predict(forest, as.matrix(X.test))$predictions
     rf.mse = mean((rf.preds - truth)**2)
@@ -106,124 +108,9 @@ mse.sample.sizes = data.frame(t(sapply(sample.sizes, function(size){
   as.numeric(c(mses, sds))
 })))
 
-colnames(mse.sample.sizes) = c("OLS", "Lasso", "RF", "LLF", "XG", "BART",
-                               "OLS.sd", "Lasso.sd", "RF.sd", "LLF.sd", "XG.sd", "BART.sd")
+colnames(mse.sample.sizes) = c("OLS", "LLF", "RF", "Lasso", "XG", "BART",
+                               "OLS.sd", "LLF.sd", "RF.sd", "Lasso.sd", "XG.sd", "BART.sd")
 
 mse.sample.sizes$size = sample.sizes
 
-write.csv(mse.sample.sizes,"wages_sample_sizes.csv", row.names = FALSE)
-
-############################
-## CALIBRATION AND IMAGES ##
-############################
-
-train.index = sample(nrow(data), size = floor(nrow(data)/2))
-train = data[train.index,]
-test = data[-train.index,]
-
-subsets = c("age", "race", "educ", "famsize")
-range.dummies = c("age.r", "race.r", "educ.r", "famsize.r")
-test$age.r = test$age < 20 | test$age > 85
-test$race.r = test$race > 200
-test$educ.r = test$educ > 111
-test$famsize.r = test$famsize > 6
-
-methods = c("lasso", "llf", "ols", "rf")
-
-forest.fit = regression_forest(train[,covariates], train$incwage, tune.parameters = T)
-ols.mod = lm(incwage ~ ., data = train[,c(covariates, "incwage")])
-mm = model.matrix( ~.^2, data = train[,covariates])
-lasso.mod = cv.glmnet(mm, train$incwage, alpha = 1)
-
-# generate plots for sparse regions and find corresponding t-statistics
-t_statistics = data.frame(t(sapply(1:length(subsets), function(i){
-
-  test.subset = subsets[i]
-  range.subset = range.dummies[i]
-  testdata = test[test[,range.subset] == T,]
-
-  names = sapply(methods, function(method){
-    paste(paste(method, test.subset, sep = "_"), "jpeg", sep = ".")
-  })
-
-  rf.preds = predict(forest.fit, testdata[,covariates])$predictions
-
-  tl = tune_ll_regression_forest(forest.fit, linear.correction.variables = continuous.covariates,
-                                 ll.weight.penalty = TRUE)$lambda.min
-  llf.preds = predict(forest.fit, testdata[,covariates], linear.correction.variables = continuous.covariates,
-                      ll.lambda = tl, ll.weight.penalty = TRUE)$predictions
-
-  ols.preds = predict(ols.mod, newdata = testdata)
-
-  mmtest = model.matrix( ~.^2, data=testdata[,covariates])
-  lasso.preds = predict(lasso.mod, newx = mmtest, lambda= lasso.mod$lambda.min)
-
-  dt = data.frame("Wages" = testdata$incwage,
-                  "LLF" = llf.preds,
-                  "RF" = rf.preds,
-                  "OLS" = ols.preds,
-                  "Lasso" = lasso.preds)
-  colnames(dt)[5] = "Lasso"
-
-  spline.fit = lm(dt$Wages ~ ns(dt$Lasso, df = 3))
-  dt$spline = spline.fit$fitted.values
-  ggplot(dt, aes(x = Lasso, y = Wages)) + geom_point(cex = 1) +
-    ylab("Observed log wages") + xlab("Predicted log wages") +
-    geom_line(aes(x = Lasso, y = spline), color = "red", lty = 1, lwd = 1.5) +
-    geom_abline(color = "Darkgray", intercept = 0, slope = 1, lty = 2, lwd = 1.5) +
-    theme_bw() +
-    theme(axis.text=element_text(size=16),axis.title=element_text(size=16)) +
-    xlim(c(range(dt$Lasso, dt$Wages))) + ylim(c(range(dt$Lasso, dt$Wages)))
-  ggsave(names[1])
-
-  spline.fit = lm(dt$Wages ~ ns(dt$LLF, df = 3))
-  dt$spline = spline.fit$fitted.values
-  ggplot(dt, aes(x = LLF, y = Wages)) + geom_point(cex = 1) +
-    ylab("Observed log wages") + xlab("Predicted log wages") +
-    geom_line(aes(x = LLF, y = spline), color = "red", lty = 1, lwd = 1.5) +
-    geom_abline(color = "Darkgray", intercept = 0, slope = 1, lty = 2, lwd = 1.5) +
-    theme_bw() +
-    theme(axis.text=element_text(size=16),axis.title=element_text(size=16)) +
-    xlim(c(range(dt$LLF, dt$Wages))) + ylim(c(range(dt$LLF, dt$Wages)))
-  ggsave(names[2])
-
-  spline.fit = lm(dt$Wages ~ ns(dt$RF, df = 3))
-  dt$spline = spline.fit$fitted.values
-  ggplot(dt, aes(x = RF, y = Wages)) + geom_point(cex = 1) +
-    ylab("Observed log wages") + xlab("Predicted log wages") +
-    geom_line(aes(x = RF, y = spline), color = "red", lty = 1, lwd = 1.5) +
-    geom_abline(color = "Darkgray", intercept = 0, slope = 1, lty = 2, lwd = 1.5) +
-    theme_bw() +
-    theme(axis.text=element_text(size=16),axis.title=element_text(size=16)) +
-    xlim(c(range(dt$RF, dt$Wages))) + ylim(c(range(dt$RF, dt$Wages)))
-  ggsave(names[3])
-
-  spline.fit = lm(dt$Wages ~ ns(dt$OLS, df = 3))
-  dt$spline = spline.fit$fitted.values
-  ggplot(dt, aes(x = OLS, y = Wages)) + geom_point(cex = 1) +
-    ylab("Observed log wages") + xlab("Predicted log wages") +
-    geom_line(aes(x = OLS, y = spline), color = "red", lty = 1, lwd = 1.5) +
-    geom_abline(color = "Darkgray", intercept = 0, slope = 1, lty = 2, lwd = 1.5) +
-    theme_bw() +
-    theme(axis.text=element_text(size=16),axis.title=element_text(size=16)) +
-    xlim(c(range(dt$OLS, dt$Wages))) + ylim(c(range(dt$OLS, dt$Wages)))
-  ggsave(names[4])
-
-  # squared differences:
-  ols.sds = (ols.preds - testdata$incwage)**2
-  llf.sds = (llf.preds- testdata$incwage)**2
-  lasso.sds = (lasso.preds - testdata$incwage)**2
-  rf.sds = (rf.preds - testdata$incwage)**2
-
-  # corresponding paired t-statistics:
-  t.ols = mean(ols.sds - llf.sds)/(sd(ols.sds - llf.sds)/sqrt(nrow(testdata)))
-  t.lasso = mean(lasso.sds - llf.sds)/(sd(lasso.sds - llf.sds)/sqrt(nrow(testdata)))
-  t.rf = mean(rf.sds - llf.sds)/(sd(rf.sds - llf.sds)/sqrt(nrow(testdata)))
-
-  c(t.ols, t.lasso, t.rf)
-})))
-
-colnames(t_statistics) = c("OLS", "Lasso", "RF")
-rownames(t_statistics) = subsets
-
-write.csv(t_statistics,"t_statistics.csv", row.names = TRUE)
+write.csv(mse.sample.sizes,"wages_sample_sizes.csv", row.names = FALSE)
\ No newline at end of file
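
[Note on patch 5] After this patch, each sample size draws its training rows once and only
the test rows are redrawn across the num.reps repetitions, so the reported sd reflects
test-set and fitting randomness around one fixed training sample rather than variability
across training samples. An illustrative skeleton of that design with stand-in data (the
real script uses the CPS extract and the six competing methods):

data = data.frame(x = rnorm(1000), y = rnorm(1000))
size = 200; size.test = 100; num.reps = 3
index.train = sample(1:nrow(data), size = size)        # drawn once per sample size
errs = sapply(1:num.reps, function(i) {
  index.test = sample((1:nrow(data))[-index.train], size = size.test)
  fit = lm(y ~ x, data = data[index.train, ])          # stand-in for each method
  mean((predict(fit, data[index.test, ]) - data$y[index.test])^2)
})
c(mean = mean(errs), sd = sd(errs))
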
From 819995e4aeb0f1c0f2926e14b61ea8965df25734 Mon Sep 17 00:00:00 2001
From: Rina Siller Friedberg
Date: Wed, 21 Aug 2019 14:47:47 -0700
Subject: [PATCH 6/6] added explicit rlearner call for readability

---
 experiments/local_linear_examples/boundary_bias_table.R | 2 +-
 experiments/local_linear_examples/friedman_table.R      | 2 +-
 experiments/local_linear_examples/wages.R               | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/experiments/local_linear_examples/boundary_bias_table.R b/experiments/local_linear_examples/boundary_bias_table.R
index 38e364aa8..db3e940d7 100644
--- a/experiments/local_linear_examples/boundary_bias_table.R
+++ b/experiments/local_linear_examples/boundary_bias_table.R
@@ -70,7 +70,7 @@ simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000){
     bart.preds = bart.mod$yhat.test.mean
     bart.mse = mean((bart.preds-truth)**2)
 
-    boost.cv.fit = cvboost(as.matrix(X), Y)
+    boost.cv.fit = rlearner::cvboost(as.matrix(X), Y)
     xgb.preds = predict(boost.cv.fit, as.matrix(X.test))
     xg.mse = mean((xgb.preds - truth)**2)
 
diff --git a/experiments/local_linear_examples/friedman_table.R b/experiments/local_linear_examples/friedman_table.R
index 6d9652655..5d62c69cd 100644
--- a/experiments/local_linear_examples/friedman_table.R
+++ b/experiments/local_linear_examples/friedman_table.R
@@ -72,7 +72,7 @@ simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000, num.trees =
     bart.preds = bart.mod$yhat.test.mean
     bart.mse = mean((bart.preds-truth)**2)
 
-    boost.cv.fit = cvboost(as.matrix(X), Y)
+    boost.cv.fit = rlearner::cvboost(as.matrix(X), Y)
     xgb.preds = predict(boost.cv.fit, as.matrix(X.test))
     xg.mse = mean((xgb.preds - truth)**2)
 
diff --git a/experiments/local_linear_examples/wages.R b/experiments/local_linear_examples/wages.R
index 7a68c0909..a15e4ed3f 100644
--- a/experiments/local_linear_examples/wages.R
+++ b/experiments/local_linear_examples/wages.R
@@ -95,7 +95,7 @@ mse.sample.sizes = data.frame(t(sapply(sample.sizes, function(size){
     bart.preds = bart.mod$yhat.test.mean
     bart.mse = mean((bart.preds-truth)**2)
 
-    boost.cv.fit = cvboost(as.matrix(X), Y)
+    boost.cv.fit = rlearner::cvboost(as.matrix(X), Y)
     xgb.preds = predict(boost.cv.fit, as.matrix(X.test))
     xg.mse = mean((xgb.preds - truth)**2)
 
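
[Note on patch 6] Usage sketch for the cvboost calls adopted across this series, assuming
the rlearner package (github.com/xnie/rlearner; not on CRAN, so installed via
devtools::install_github("xnie/rlearner")). The call shape mirrors the patches' own usage;
treat the details as illustrative rather than a definitive account of the API:

library(rlearner)
X = matrix(runif(500 * 5), nrow = 500)
Y = 10 * sin(pi * X[, 1] * X[, 2]) + rnorm(500)
boost.cv.fit = rlearner::cvboost(as.matrix(X), Y)  # cross-validated xgboost regression fit
xgb.preds = predict(boost.cv.fit, as.matrix(X))    # predictions at (new) covariate values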