From 34857b79b6d6a3a157ebd30674e03ee90e454663 Mon Sep 17 00:00:00 2001
From: Rina Siller Friedberg
Date: Mon, 19 Aug 2019 18:12:34 -0700
Subject: [PATCH 1/6] updated boosting and bart implementations in simulations

---
 .../local_linear_examples/bias_image.R     |  4 +-
 .../boundary_bias_table.R                  | 38 +++----------------
 .../local_linear_examples/friedman_table.R | 38 +++----------------
 3 files changed, 12 insertions(+), 68 deletions(-)

diff --git a/experiments/local_linear_examples/bias_image.R b/experiments/local_linear_examples/bias_image.R
index 6dc6cc191..db61ac99e 100644
--- a/experiments/local_linear_examples/bias_image.R
+++ b/experiments/local_linear_examples/bias_image.R
@@ -17,9 +17,7 @@ ll.forest = local_linear_forest(X, Y)
 
 # lasso to select local linear correction variables
 lasso.mod = cv.glmnet(X, Y, alpha=1)
-selected = which(coef(lasso.mod) != 0)
-# remove intercept and adjust indexes correspondingly
-selected = selected[2:length(selected)] - 1
+selected = as.numeric(predict(lasso.mod, type = "nonzero"))
 preds.llf = predict(ll.forest, linear.correction.variables = selected, tune.lambda = TRUE)$predictions
 
 ticks = seq(-1, 1, length = 2000)
diff --git a/experiments/local_linear_examples/boundary_bias_table.R b/experiments/local_linear_examples/boundary_bias_table.R
index 9b681fa34..38e364aa8 100644
--- a/experiments/local_linear_examples/boundary_bias_table.R
+++ b/experiments/local_linear_examples/boundary_bias_table.R
@@ -2,6 +2,7 @@ library(grf)
 library(glmnet)
 library(BART)
 library(xgboost)
+library(rlearner)
 
 set.seed(1234)
 
@@ -9,27 +10,6 @@ mu = function(x){
   log(1+exp(6*x[1]))
 }
 
-boosting.cv = function(dtrain, Y){
-
-  # cross-validate on a reasonable grid
-  etas = seq(0, 0.3, by = 0.1)
-  nrounds = c(50, 100, 500, 1000)
-  max_depth = seq(2, 8, by = 2)
-
-  args.cv = expand.grid(e = etas, n = nrounds, d = max_depth)
-  results = t(apply(args.cv, MARGIN = 1, FUN = function(arguments){
-    model.xgb = xgboost(dtrain, nrounds = arguments[2],
-                        params = list(objective = "reg:linear"),
-                        eval_metric = "rmse",
-                        eta = arguments[1],
-                        max_depth = arguments[3])
-    xgb.preds = predict(model.xgb, newdata = dtrain)
-    mean((xgb.preds - Y)**2)
-  }))
-  best.index = which.min(results)
-  return(args.cv[best.index,])
-}
-
 simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000){
   errors = replicate(num.reps, {
     X = matrix(runif(n*p, -1, 1), nrow = n)
@@ -57,7 +37,7 @@ simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000){
 
     # use lasso to select linear correction variables
     lasso.mod = cv.glmnet(X, Y, alpha = 1)
-    lasso.coef = predict(lasso.mod, type = "nonzero")
+    lasso.coef = as.numeric(predict(lasso.mod, type = "nonzero"))
     if(!is.null(dim(lasso.coef))){
       selected = lasso.coef[,1]
     } else {
@@ -86,20 +66,12 @@ simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000){
     lasso.rf.preds = predict(rf, newdata = X.test)$predictions + predict(lasso.mod, newx = X.test, s = "lambda.min")
     lasso.rf.mse = mean((lasso.rf.preds-truth)**2)
 
-    bart.mod = wbart(X, Y, X.test, nskip = 5, ndpost = 5)
+    bart.mod = wbart(X, Y, X.test)
     bart.preds = bart.mod$yhat.test.mean
     bart.mse = mean((bart.preds-truth)**2)
 
-    dtrain = xgb.DMatrix(X, label = Y)
-    cv.variables = boosting.cv(dtrain, Y)
-    cv.variables = as.numeric(cv.variables)
-    eta = cv.variables[1]
-    nrounds = cv.variables[2]
-    max_depth = cv.variables[3]
-
-    model.xgb = xgboost(dtrain, nrounds = nrounds, params = list(objective = "reg:linear"),
-                        eval_metric = "rmse", eta = eta, max_depth = max_depth)
-    xgb.preds = predict(model.xgb, newdata = X.test)
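
[Note on patch 1] The removed boosting.cv() tuned xgboost over a grid but scored every
candidate on the training data itself (it predicts on dtrain and compares to Y), which
systematically favors overfit settings; rlearner::cvboost tunes by held-out error instead.
Dropping nskip/ndpost from wbart likewise restores the package defaults (per BART's
documentation, 1000 posterior draws after 100 burn-in) in place of 5 and 5. A minimal
sketch of the cross-validated tuning idea, with a hypothetical grid and round count,
not rlearner's actual internals:

library(xgboost)
tune.boost = function(X, Y, nfold = 5) {
  grid = expand.grid(eta = c(0.05, 0.1, 0.3), max_depth = c(2, 4, 6))
  cv.err = apply(grid, 1, function(g) {
    cv = xgb.cv(params = list(objective = "reg:squarederror",  # newer spelling of reg:linear
                              eta = g[["eta"]], max_depth = g[["max_depth"]]),
                data = xgb.DMatrix(X, label = Y),
                nrounds = 200, nfold = nfold, verbose = 0)
    # held-out rmse, not training rmse, drives the selection
    min(cv$evaluation_log$test_rmse_mean)
  })
  best = grid[which.min(cv.err), ]
  xgboost(data = xgb.DMatrix(X, label = Y), nrounds = 200, verbose = 0,
          params = list(objective = "reg:squarederror",
                        eta = best$eta, max_depth = best$max_depth))
}
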
+    boost.cv.fit = cvboost(as.matrix(X), Y)
+    xgb.preds = predict(boost.cv.fit, as.matrix(X.test))
     xg.mse = mean((xgb.preds - truth)**2)
 
     c(llf.stepwise.mse, llf.oracle, rf.mse, rf.adapt.mse, lasso.rf.mse, bart.mse, xg.mse, llf.lasso.mse)
diff --git a/experiments/local_linear_examples/friedman_table.R b/experiments/local_linear_examples/friedman_table.R
index 5ba3f9913..6d9652655 100644
--- a/experiments/local_linear_examples/friedman_table.R
+++ b/experiments/local_linear_examples/friedman_table.R
@@ -2,6 +2,7 @@ library(grf)
 library(glmnet)
 library(BART)
 library(xgboost)
+library(rlearner)
 
 set.seed(1234)
 
@@ -9,27 +10,6 @@ ff = function(x){
   return(10*sin(pi*x[1]*x[2]) + 20*((x[3] - 0.5)**2) + 10*x[4] + 5*x[5])
 }
 
-boosting.cv = function(dtrain, Y){
-
-  # cross-validate on a reasonable grid
-  etas = seq(0, 0.3, by = 0.1)
-  nrounds = c(50, 100, 500, 1000)
-  max_depth = seq(2, 8, by = 2)
-
-  args.cv = expand.grid(e = etas, n = nrounds, d = max_depth)
-  results = t(apply(args.cv, MARGIN = 1, FUN = function(arguments){
-    model.xgb = xgboost(dtrain, nrounds = arguments[2],
-                        params = list(objective = "reg:linear"),
-                        eval_metric = "rmse",
-                        eta = arguments[1],
-                        max_depth = arguments[3])
-    xgb.preds = predict(model.xgb, newdata = dtrain)
-    mean((xgb.preds - Y)**2)
-  }))
-  best.index = which.min(results)
-  return(args.cv[best.index,])
-}
-
 simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000, num.trees = 2000){
   errors = replicate(num.reps, {
     X = matrix(runif(n*p,0,1), nrow = n)
@@ -58,7 +38,8 @@ simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000, num.trees =
 
     # use lasso to select linear correction variables
     lasso.mod = cv.glmnet(X, Y, alpha = 1)
-    lasso.coef = predict(lasso.mod, type = "nonzero")
+    lasso.coef = as.numeric(predict(lasso.mod, type = "nonzero"))
+    # if lasso chose no variables, use all for LL correction
     if(!is.null(dim(lasso.coef))){
       selected = lasso.coef[,1]
     } else {
@@ -87,19 +68,12 @@ simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000, num.trees =
     lasso.rf.preds = predict(rf, newdata = X.test)$predictions + predict(lasso.mod, newx = X.test, s = "lambda.min")
     lasso.rf.mse = mean((lasso.rf.preds-truth)**2)
 
-    bart.mod = wbart(X, Y, X.test, nskip = 5, ndpost = 5)
+    bart.mod = wbart(X, Y, X.test)
     bart.preds = bart.mod$yhat.test.mean
     bart.mse = mean((bart.preds-truth)**2)
 
-    dtrain = xgb.DMatrix(X, label = Y)
-    cv.variables = as.numeric(boosting.cv(dtrain, Y))
-    eta = cv.variables[1]
-    nrounds = cv.variables[2]
-    max_depth = cv.variables[3]
-
-    model.xgb = xgboost(dtrain, nrounds = nrounds, params = list(objective = "reg:linear"),
-                        eval_metric = "rmse", eta = eta, max_depth = max_depth)
-    xgb.preds = predict(model.xgb, newdata = X.test)
+    boost.cv.fit = cvboost(as.matrix(X), Y)
+    xgb.preds = predict(boost.cv.fit, as.matrix(X.test))
     xg.mse = mean((xgb.preds - truth)**2)
 
     c(llf.stepwise.mse, llf.oracle, rf.mse, rf.adapt.mse, lasso.rf.mse, bart.mse, xg.mse, llf.lasso.mse)

From 8ef66ed3b5d6cd8d6e265e87c2c7544b9dbd1669 Mon Sep 17 00:00:00 2001
From: Rina Siller Friedberg
Date: Mon, 19 Aug 2019 18:20:34 -0700
Subject: [PATCH 2/6] added wages simulation code

---
 experiments/local_linear_examples/wages.R | 228 ++++++++++++++++++++++
 1 file changed, 228 insertions(+)
 create mode 100644 experiments/local_linear_examples/wages.R

diff --git a/experiments/local_linear_examples/wages.R b/experiments/local_linear_examples/wages.R
new file mode 100644
index 000000000..8aa1030ee
--- /dev/null
+++ b/experiments/local_linear_examples/wages.R
@@ -0,0 +1,228 @@
+rm(list = ls())
+set.seed(123)
+
+library(grf)
+library(glmnet)
+library(BART)
+library(xgboost)
+library(ggplot2)
+library(dplyr)
+library(splines)
+
+# load the data
+load("cps1976_2018.RData")
+data = data.frame(cps1976_2018)
+
+# clean data: remove NA and missing outcomes
+data = data[!is.na(data$incwage),]
+data = data[data$incwage != 9999999,] # remove missing
+data = data[data$incwage != 9999998,] # remove missing
+
+# extract 2018 data
+data = data[data$year == 2018,]
+
+# add age^2, educ^2 covariates
+data$agesq = data$age**2
+data$educsq = data$educ**2
+
+covariates = c("age", "agesq", "educ", "educsq", "occ2010", "occ10ly", "sex", "race",
+               "marst", "labforce", "ind1950", "classwkr", "wkstat", "uhrswork1", "metro", "famsize")
+
+continuous.covariates = which(covariates %in% c("age", "educ", "agesq", "educsq", "uhrswork1", "famsize"))
+outcome = "incwage"
+
+data = data[,c(covariates, outcome)]
+data = data[complete.cases(data),]
+
+# transform outcome (standard for wage regressions)
+data$incwage = log(data$incwage + 1)
+
+######################
+## ERROR EVALUATION ##
+######################
+
+num.reps = 100
+size.test = 10000
+sample.sizes = c(2000, 5000, 10000, 50000)
+
+mse.sample.sizes = data.frame(t(sapply(sample.sizes, function(size){
+
+  results = data.frame(t(sapply(1:num.reps, function(i){
+    index.train = sample(1:nrow(data), size = size, replace = FALSE)
+    index.test = sample(1:nrow(data), size = size.test, replace = FALSE)
+
+    X = data[index.train, covariates]
+    Y = data$incwage[index.train]
+
+    X = X[complete.cases(X),]
+    Y = Y[complete.cases(X)]
+
+    forest = regression_forest(as.matrix(X), Y, honesty = TRUE, tune.parameters = TRUE)
+
+    X.test = data[index.test, covariates]
+    truth = data$incwage[index.test]
+
+    X.test = X.test[complete.cases(X.test),]
+    truth = truth[complete.cases(X.test)]
+
+    ll.lambda = tune_ll_regression_forest(forest, linear.correction.variables = continuous.covariates,
+                                          ll.weight.penalty = TRUE)$lambda.min
+    llf.lasso.preds = predict(forest, as.matrix(X.test),
+                              linear.correction.variables = continuous.covariates,
+                              ll.lambda = ll.lambda,
+                              ll.weight.penalty = TRUE)$predictions
+    llf.mse = mean((llf.lasso.preds - truth)**2)
+
+    rf.preds = predict(forest, as.matrix(X.test))$predictions
+    rf.mse = mean((rf.preds - truth)**2)
+
+    ols.form = as.formula(paste("Y", paste(covariates, collapse = "+"), sep = "~"))
+    dd.ols = cbind(Y, X)
+    ols.fit = lm(ols.form, dd.ols)
+    ols.preds = predict(ols.fit, X.test)
+    ols.mse = mean((ols.preds - truth)**2)
+
+    mm = model.matrix( ~.^2, data = X)
+    lasso.mod = cv.glmnet(mm, Y, alpha = 1)
+    mmtest = model.matrix( ~.^2, data = X.test)
+    lasso.preds = predict(lasso.mod, newx = mmtest, s = lasso.mod$lambda.min)
+    lasso.mse = mean((lasso.preds - truth)**2)
+
+    bart.mod = wbart(X, Y, X.test)
+    bart.preds = bart.mod$yhat.test.mean
+    bart.mse = mean((bart.preds-truth)**2)
+
+    boost.cv.fit = cvboost(as.matrix(X), Y)
+    xgb.preds = predict(boost.cv.fit, as.matrix(X.test))
+    xg.mse = mean((xgb.preds - truth)**2)
+
+    return(c(ols.mse, llf.mse, rf.mse, lasso.mse, xg.mse, bart.mse))
+  })))
+
+  mses = colMeans(results)
+  sds = apply(results, MARGIN = 2, FUN = sd)
+
+  as.numeric(c(mses, sds))
+})))
+
+colnames(mse.sample.sizes) = c("OLS", "Lasso", "RF", "LLF", "XG", "BART",
+                               "OLS.sd", "Lasso.sd", "RF.sd", "LLF.sd", "XG.sd", "BART.sd")
+
+mse.sample.sizes$size = sample.sizes
+
+write.csv(mse.sample.sizes,"wages_sample_sizes.csv", row.names = FALSE)
+
+############################
+## CALIBRATION AND IMAGES ##
+############################
+
+train.index = sample(nrow(data), size = floor(nrow(data)/2))
+train = data[train.index,]
+test = data[-train.index,]
+
+subsets = c("age", "race", "educ", "famsize")
+range.dummies = c("age.r", "race.r", "educ.r", "famsize.r")
+test$age.r = test$age < 20 | test$age > 85
+test$race.r = test$race > 200
+test$educ.r = test$educ > 111
+test$famsize.r = test$famsize > 6
+
+methods = c("lasso", "llf", "ols", "rf")
+
+forest.fit = regression_forest(train[,covariates], train$incwage, tune.parameters = T)
+ols.mod = lm(incwage ~ ., data = train[,c(covariates, "incwage")])
+mm = model.matrix( ~.^2, data = train[,covariates])
+lasso.mod = cv.glmnet(mm, train$incwage, alpha = 1)
+
+# generate plots for sparse regions and find corresponding t-statistics
+t_statistics = data.frame(t(sapply(1:length(subsets), function(i){
+
+  test.subset = subsets[i]
+  range.subset = range.dummies[i]
+  testdata = test[test[,range.subset] == T,]
+
+  names = sapply(methods, function(method){
+    paste(paste(method, test.subset, sep = "_"), "jpeg", sep = ".")
+  })
+
+  rf.preds = predict(forest.fit, testdata[,covariates])$predictions
+
+  tl = tune_ll_regression_forest(forest.fit, linear.correction.variables = continuous.covariates,
+                                 ll.weight.penalty = TRUE)$lambda.min
+  llf.preds = predict(forest.fit, testdata[,covariates], linear.correction.variables = continuous.covariates,
+                      ll.lambda = tl, ll.weight.penalty = TRUE)$predictions
+
+  ols.preds = predict(ols.mod, newdata = testdata)
+
+  mmtest = model.matrix( ~.^2, data=testdata[,covariates])
+  lasso.preds = predict(lasso.mod, newx = mmtest, lambda= lasso.mod$lambda.min)
+
+  dt = data.frame("Wages" = testdata$incwage,
+                  "LLF" = llf.preds,
+                  "RF" = rf.preds,
+                  "OLS" = ols.preds,
+                  "Lasso" = lasso.preds)
+  colnames(dt)[5] = "Lasso"
+
+  spline.fit = lm(dt$Wages ~ ns(dt$Lasso, df = 3))
+  dt$spline = spline.fit$fitted.values
+  ggplot(dt, aes(x = Lasso, y = Wages)) + geom_point(cex = 1) +
+    ylab("Observed log wages") + xlab("Predicted log wages") +
+    geom_line(aes(x = Lasso, y = spline), color = "red", lty = 1, lwd = 1.5) +
+    geom_abline(color = "Darkgray", intercept = 0, slope = 1, lty = 2, lwd = 1.5) +
+    theme_bw() +
+    theme(axis.text=element_text(size=16),axis.title=element_text(size=16)) +
+    xlim(c(range(dt$Lasso, dt$Wages))) + ylim(c(range(dt$Lasso, dt$Wages)))
+  ggsave(names[1])
+
+  spline.fit = lm(dt$Wages ~ ns(dt$LLF, df = 3))
+  dt$spline = spline.fit$fitted.values
+  ggplot(dt, aes(x = LLF, y = Wages)) + geom_point(cex = 1) +
+    ylab("Observed log wages") + xlab("Predicted log wages") +
+    geom_line(aes(x = LLF, y = spline), color = "red", lty = 1, lwd = 1.5) +
+    geom_abline(color = "Darkgray", intercept = 0, slope = 1, lty = 2, lwd = 1.5) +
+    theme_bw() +
+    theme(axis.text=element_text(size=16),axis.title=element_text(size=16)) +
+    xlim(c(range(dt$LLF, dt$Wages))) + ylim(c(range(dt$LLF, dt$Wages)))
+  ggsave(names[2])
+
+  spline.fit = lm(dt$Wages ~ ns(dt$RF, df = 3))
+  dt$spline = spline.fit$fitted.values
+  ggplot(dt, aes(x = RF, y = Wages)) + geom_point(cex = 1) +
+    ylab("Observed log wages") + xlab("Predicted log wages") +
+    geom_line(aes(x = RF, y = spline), color = "red", lty = 1, lwd = 1.5) +
+    geom_abline(color = "Darkgray", intercept = 0, slope = 1, lty = 2, lwd = 1.5) +
+    theme_bw() +
+    theme(axis.text=element_text(size=16),axis.title=element_text(size=16)) +
+    xlim(c(range(dt$RF, dt$Wages))) + ylim(c(range(dt$RF, dt$Wages)))
+  ggsave(names[3])
+
+  spline.fit = lm(dt$Wages ~ ns(dt$OLS, df = 3))
+  dt$spline = spline.fit$fitted.values
+  ggplot(dt, aes(x = OLS, y = Wages)) + geom_point(cex = 1) +
+    ylab("Observed log wages") + xlab("Predicted log wages") +
+    geom_line(aes(x = OLS, y = spline), color = "red", lty = 1, lwd = 1.5) +
+    geom_abline(color = "Darkgray", intercept = 0, slope = 1, lty = 2, lwd = 1.5) +
+    theme_bw() +
+    theme(axis.text=element_text(size=16),axis.title=element_text(size=16)) +
+    xlim(c(range(dt$OLS, dt$Wages))) + ylim(c(range(dt$OLS, dt$Wages)))
+  ggsave(names[4])
+
+  # squared differences:
+  ols.sds = (ols.preds - testdata$incwage)**2
+  llf.sds = (llf.preds- testdata$incwage)**2
+  lasso.sds = (lasso.preds - testdata$incwage)**2
+  rf.sds = (rf.preds - testdata$incwage)**2
+
+  # corresponding paired t-statistics:
+  t.ols = mean(ols.sds - llf.sds)/(sd(ols.sds - llf.sds)/sqrt(nrow(testdata)))
+  t.lasso = mean(lasso.sds - llf.sds)/(sd(lasso.sds - llf.sds)/sqrt(nrow(testdata)))
+  t.rf = mean(rf.sds - llf.sds)/(sd(rf.sds - llf.sds)/sqrt(nrow(testdata)))
+
+  c(t.ols, t.lasso, t.rf)
+})))
+
+colnames(t_statistics) = c("OLS", "Lasso", "RF")
+rownames(t_statistics) = subsets
+
+write.csv(t_statistics,"t_statistics.csv", row.names = TRUE)
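
[Note on patch 2] The paired t-statistics at the end of wages.R are hand-rolled one-sample
t statistics on per-observation differences in squared error. A quick sanity check with
illustrative stand-in data that the formula used there matches R's built-in t.test:

d = rnorm(100, mean = 0.1)  # stand-in for, e.g., ols.sds - llf.sds
t.manual = mean(d) / (sd(d) / sqrt(length(d)))
t.builtin = unname(t.test(d)$statistic)
stopifnot(isTRUE(all.equal(t.manual, t.builtin)))
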
From a454f5c6819a10d2dc07c12eaa7f2ad32428f872 Mon Sep 17 00:00:00 2001
From: Rina Siller Friedberg
Date: Mon, 19 Aug 2019 18:24:00 -0700
Subject: [PATCH 3/6] edited wages code for clarity

---
 experiments/local_linear_examples/wages.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/experiments/local_linear_examples/wages.R b/experiments/local_linear_examples/wages.R
index 8aa1030ee..8b3d8c286 100644
--- a/experiments/local_linear_examples/wages.R
+++ b/experiments/local_linear_examples/wages.R
@@ -8,6 +8,7 @@ library(xgboost)
 library(ggplot2)
 library(dplyr)
 library(splines)
+library(rlearner)
 
 # load the data
 load("cps1976_2018.RData")
 data = data.frame(cps1976_2018)
@@ -49,7 +50,7 @@ mse.sample.sizes = data.frame(t(sapply(sample.sizes, function(size){
 
   results = data.frame(t(sapply(1:num.reps, function(i){
     index.train = sample(1:nrow(data), size = size, replace = FALSE)
-    index.test = sample(1:nrow(data), size = size.test, replace = FALSE)
+    index.test = sample((1:nrow(data))[-index.train], size = size.test, replace = FALSE)
 
     X = data[index.train, covariates]
     Y = data$incwage[index.train]
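
[Note on patch 3] Why the index.test change matters: sampling test rows from all of
1:nrow(data) lets train and test overlap, which biases the error estimates downward.
An illustration with toy indices:

n = 1000
tr = sample(1:n, 500)
te.old = sample(1:n, 500)          # old draw: can overlap with tr
te.new = sample((1:n)[-tr], 500)   # patched draw: disjoint by construction
length(intersect(tr, te.old))      # typically around 250 shared rows
length(intersect(tr, te.new))      # always 0
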
From 8bced9724a857ed1ed3dd1de33d56b9dee1ee182 Mon Sep 17 00:00:00 2001
From: Rina Siller Friedberg
Date: Mon, 19 Aug 2019 18:24:53 -0700
Subject: [PATCH 4/6] added wages call in main file

---
 experiments/local_linear_examples/main.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/experiments/local_linear_examples/main.R b/experiments/local_linear_examples/main.R
index 5ec784c0f..63354f524 100644
--- a/experiments/local_linear_examples/main.R
+++ b/experiments/local_linear_examples/main.R
@@ -10,3 +10,4 @@ source("friedman_table.R")
 source("confidence.R")
 source("causal_table.R")
 source("boundary_bias_table.R")
+source("wages.R")
\ No newline at end of file

From 206e68be3c8995121947368985aa77a3fe17453f Mon Sep 17 00:00:00 2001
From: Rina Siller Friedberg
Date: Wed, 21 Aug 2019 11:35:52 -0700
Subject: [PATCH 5/6] updated wages code to reflect new train test split

---
 experiments/local_linear_examples/wages.R | 151 +++------------------
 1 file changed, 19 insertions(+), 132 deletions(-)

diff --git a/experiments/local_linear_examples/wages.R b/experiments/local_linear_examples/wages.R
index 8b3d8c286..7a68c0909 100644
--- a/experiments/local_linear_examples/wages.R
+++ b/experiments/local_linear_examples/wages.R
@@ -47,18 +47,18 @@ size.test = 10000
 sample.sizes = c(2000, 5000, 10000, 50000)
 
 mse.sample.sizes = data.frame(t(sapply(sample.sizes, function(size){
-
+  index.train = sample(1:nrow(data), size = size, replace = FALSE)
+
+  X = data[index.train, covariates]
+  Y = data$incwage[index.train]
+
+  X = X[complete.cases(X),]
+  Y = Y[complete.cases(X)]
+
   results = data.frame(t(sapply(1:num.reps, function(i){
+    print(i)
-    index.train = sample(1:nrow(data), size = size, replace = FALSE)
-    index.test = sample((1:nrow(data))[-index.train], size = size.test, replace = FALSE)
 
-    X = data[index.train, covariates]
-    Y = data$incwage[index.train]
-
-    X = X[complete.cases(X),]
-    Y = Y[complete.cases(X)]
-
-    forest = regression_forest(as.matrix(X), Y, honesty = TRUE, tune.parameters = TRUE)
+    index.test = sample((1:nrow(data))[-index.train], size = size.test, replace = FALSE)
 
     X.test = data[index.test, covariates]
     truth = data$incwage[index.test]
@@ -66,13 +66,15 @@ mse.sample.sizes = data.frame(t(sapply(sample.sizes, function(size){
     X.test = X.test[complete.cases(X.test),]
     truth = truth[complete.cases(X.test)]
 
+    forest = regression_forest(as.matrix(X), Y, honesty = TRUE, tune.parameters = TRUE)
+
     ll.lambda = tune_ll_regression_forest(forest, linear.correction.variables = continuous.covariates,
-                                          ll.weight.penalty = TRUE)$lambda.min
-    llf.lasso.preds = predict(forest, as.matrix(X.test),
+                                          ll.weight.penalty = T)$lambda.min
+    llf.preds = predict(forest, as.matrix(X.test),
                               linear.correction.variables = continuous.covariates,
                               ll.lambda = ll.lambda,
-                              ll.weight.penalty = TRUE)$predictions
-    llf.mse = mean((llf.lasso.preds - truth)**2)
+                              ll.weight.penalty = T)$predictions
+    llf.mse = mean((llf.preds - truth)**2)
 
     rf.preds = predict(forest, as.matrix(X.test))$predictions
     rf.mse = mean((rf.preds - truth)**2)
@@ -106,124 +108,9 @@ mse.sample.sizes = data.frame(t(sapply(sample.sizes, function(size){
   as.numeric(c(mses, sds))
 })))
 
-colnames(mse.sample.sizes) = c("OLS", "Lasso", "RF", "LLF", "XG", "BART",
-                               "OLS.sd", "Lasso.sd", "RF.sd", "LLF.sd", "XG.sd", "BART.sd")
+colnames(mse.sample.sizes) = c("OLS", "LLF", "RF", "Lasso", "XG", "BART",
+                               "OLS.sd", "LLF.sd", "RF.sd", "Lasso.sd", "XG.sd", "BART.sd")
 
 mse.sample.sizes$size = sample.sizes
 
-write.csv(mse.sample.sizes,"wages_sample_sizes.csv", row.names = FALSE)
-
-############################
-## CALIBRATION AND IMAGES ##
-############################
-
-train.index = sample(nrow(data), size = floor(nrow(data)/2))
-train = data[train.index,]
-test = data[-train.index,]
-
-subsets = c("age", "race", "educ", "famsize")
-range.dummies = c("age.r", "race.r", "educ.r", "famsize.r")
-test$age.r = test$age < 20 | test$age > 85
-test$race.r = test$race > 200
-test$educ.r = test$educ > 111
-test$famsize.r = test$famsize > 6
-
-methods = c("lasso", "llf", "ols", "rf")
-
-forest.fit = regression_forest(train[,covariates], train$incwage, tune.parameters = T)
-ols.mod = lm(incwage ~ ., data = train[,c(covariates, "incwage")])
-mm = model.matrix( ~.^2, data = train[,covariates])
-lasso.mod = cv.glmnet(mm, train$incwage, alpha = 1)
-
-# generate plots for sparse regions and find corresponding t-statistics
-t_statistics = data.frame(t(sapply(1:length(subsets), function(i){
-
-  test.subset = subsets[i]
-  range.subset = range.dummies[i]
-  testdata = test[test[,range.subset] == T,]
-
-  names = sapply(methods, function(method){
-    paste(paste(method, test.subset, sep = "_"), "jpeg", sep = ".")
-  })
-
-  rf.preds = predict(forest.fit, testdata[,covariates])$predictions
-
-  tl = tune_ll_regression_forest(forest.fit, linear.correction.variables = continuous.covariates,
-                                 ll.weight.penalty = TRUE)$lambda.min
-  llf.preds = predict(forest.fit, testdata[,covariates], linear.correction.variables = continuous.covariates,
-                      ll.lambda = tl, ll.weight.penalty = TRUE)$predictions
-
-  ols.preds = predict(ols.mod, newdata = testdata)
-
-  mmtest = model.matrix( ~.^2, data=testdata[,covariates])
-  lasso.preds = predict(lasso.mod, newx = mmtest, lambda= lasso.mod$lambda.min)
-
-  dt = data.frame("Wages" = testdata$incwage,
-                  "LLF" = llf.preds,
-                  "RF" = rf.preds,
-                  "OLS" = ols.preds,
-                  "Lasso" = lasso.preds)
-  colnames(dt)[5] = "Lasso"
-
-  spline.fit = lm(dt$Wages ~ ns(dt$Lasso, df = 3))
-  dt$spline = spline.fit$fitted.values
-  ggplot(dt, aes(x = Lasso, y = Wages)) + geom_point(cex = 1) +
-    ylab("Observed log wages") + xlab("Predicted log wages") +
-    geom_line(aes(x = Lasso, y = spline), color = "red", lty = 1, lwd = 1.5) +
-    geom_abline(color = "Darkgray", intercept = 0, slope = 1, lty = 2, lwd = 1.5) +
-    theme_bw() +
-    theme(axis.text=element_text(size=16),axis.title=element_text(size=16)) +
-    xlim(c(range(dt$Lasso, dt$Wages))) + ylim(c(range(dt$Lasso, dt$Wages)))
-  ggsave(names[1])
-
-  spline.fit = lm(dt$Wages ~ ns(dt$LLF, df = 3))
-  dt$spline = spline.fit$fitted.values
-  ggplot(dt, aes(x = LLF, y = Wages)) + geom_point(cex = 1) +
-    ylab("Observed log wages") + xlab("Predicted log wages") +
-    geom_line(aes(x = LLF, y = spline), color = "red", lty = 1, lwd = 1.5) +
-    geom_abline(color = "Darkgray", intercept = 0, slope = 1, lty = 2, lwd = 1.5) +
-    theme_bw() +
-    theme(axis.text=element_text(size=16),axis.title=element_text(size=16)) +
-    xlim(c(range(dt$LLF, dt$Wages))) + ylim(c(range(dt$LLF, dt$Wages)))
-  ggsave(names[2])
-
-  spline.fit = lm(dt$Wages ~ ns(dt$RF, df = 3))
-  dt$spline = spline.fit$fitted.values
-  ggplot(dt, aes(x = RF, y = Wages)) + geom_point(cex = 1) +
-    ylab("Observed log wages") + xlab("Predicted log wages") +
-    geom_line(aes(x = RF, y = spline), color = "red", lty = 1, lwd = 1.5) +
-    geom_abline(color = "Darkgray", intercept = 0, slope = 1, lty = 2, lwd = 1.5) +
-    theme_bw() +
-    theme(axis.text=element_text(size=16),axis.title=element_text(size=16)) +
-    xlim(c(range(dt$RF, dt$Wages))) + ylim(c(range(dt$RF, dt$Wages)))
-  ggsave(names[3])
-
-  spline.fit = lm(dt$Wages ~ ns(dt$OLS, df = 3))
-  dt$spline = spline.fit$fitted.values
-  ggplot(dt, aes(x = OLS, y = Wages)) + geom_point(cex = 1) +
-    ylab("Observed log wages") + xlab("Predicted log wages") +
-    geom_line(aes(x = OLS, y = spline), color = "red", lty = 1, lwd = 1.5) +
-    geom_abline(color = "Darkgray", intercept = 0, slope = 1, lty = 2, lwd = 1.5) +
-    theme_bw() +
-    theme(axis.text=element_text(size=16),axis.title=element_text(size=16)) +
-    xlim(c(range(dt$OLS, dt$Wages))) + ylim(c(range(dt$OLS, dt$Wages)))
-  ggsave(names[4])
-
-  # squared differences:
-  ols.sds = (ols.preds - testdata$incwage)**2
-  llf.sds = (llf.preds- testdata$incwage)**2
-  lasso.sds = (lasso.preds - testdata$incwage)**2
-  rf.sds = (rf.preds - testdata$incwage)**2
-
-  # corresponding paired t-statistics:
-  t.ols = mean(ols.sds - llf.sds)/(sd(ols.sds - llf.sds)/sqrt(nrow(testdata)))
-  t.lasso = mean(lasso.sds - llf.sds)/(sd(lasso.sds - llf.sds)/sqrt(nrow(testdata)))
-  t.rf = mean(rf.sds - llf.sds)/(sd(rf.sds - llf.sds)/sqrt(nrow(testdata)))
-
-  c(t.ols, t.lasso, t.rf)
-})))
-
-colnames(t_statistics) = c("OLS", "Lasso", "RF")
-rownames(t_statistics) = subsets
-
-write.csv(t_statistics,"t_statistics.csv", row.names = TRUE)
+write.csv(mse.sample.sizes,"wages_sample_sizes.csv", row.names = FALSE)
\ No newline at end of file
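
[Note on patch 5] After this patch, each sample size draws its training rows once and only
the test rows are redrawn across the num.reps repetitions, so the reported sd reflects
test-set and fitting randomness around one fixed training sample rather than variability
across training samples. An illustrative skeleton of that design with stand-in data (the
real script uses the CPS extract and the six competing methods):

data = data.frame(x = rnorm(1000), y = rnorm(1000))
size = 200; size.test = 100; num.reps = 3
index.train = sample(1:nrow(data), size = size)        # drawn once per sample size
errs = sapply(1:num.reps, function(i) {
  index.test = sample((1:nrow(data))[-index.train], size = size.test)
  fit = lm(y ~ x, data = data[index.train, ])          # stand-in for each method
  mean((predict(fit, data[index.test, ]) - data$y[index.test])^2)
})
c(mean = mean(errs), sd = sd(errs))
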
From 819995e4aeb0f1c0f2926e14b61ea8965df25734 Mon Sep 17 00:00:00 2001
From: Rina Siller Friedberg
Date: Wed, 21 Aug 2019 14:47:47 -0700
Subject: [PATCH 6/6] added explicit rlearner call for readability

---
 experiments/local_linear_examples/boundary_bias_table.R | 2 +-
 experiments/local_linear_examples/friedman_table.R      | 2 +-
 experiments/local_linear_examples/wages.R               | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/experiments/local_linear_examples/boundary_bias_table.R b/experiments/local_linear_examples/boundary_bias_table.R
index 38e364aa8..db3e940d7 100644
--- a/experiments/local_linear_examples/boundary_bias_table.R
+++ b/experiments/local_linear_examples/boundary_bias_table.R
@@ -70,7 +70,7 @@ simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000){
     bart.preds = bart.mod$yhat.test.mean
     bart.mse = mean((bart.preds-truth)**2)
 
-    boost.cv.fit = cvboost(as.matrix(X), Y)
+    boost.cv.fit = rlearner::cvboost(as.matrix(X), Y)
     xgb.preds = predict(boost.cv.fit, as.matrix(X.test))
     xg.mse = mean((xgb.preds - truth)**2)
 
diff --git a/experiments/local_linear_examples/friedman_table.R b/experiments/local_linear_examples/friedman_table.R
index 6d9652655..5d62c69cd 100644
--- a/experiments/local_linear_examples/friedman_table.R
+++ b/experiments/local_linear_examples/friedman_table.R
@@ -72,7 +72,7 @@ simulation.run = function(n, p, sigma, num.reps = 100, ntest = 2000, num.trees =
     bart.preds = bart.mod$yhat.test.mean
     bart.mse = mean((bart.preds-truth)**2)
 
-    boost.cv.fit = cvboost(as.matrix(X), Y)
+    boost.cv.fit = rlearner::cvboost(as.matrix(X), Y)
     xgb.preds = predict(boost.cv.fit, as.matrix(X.test))
     xg.mse = mean((xgb.preds - truth)**2)
 
diff --git a/experiments/local_linear_examples/wages.R b/experiments/local_linear_examples/wages.R
index 7a68c0909..a15e4ed3f 100644
--- a/experiments/local_linear_examples/wages.R
+++ b/experiments/local_linear_examples/wages.R
@@ -95,7 +95,7 @@ mse.sample.sizes = data.frame(t(sapply(sample.sizes, function(size){
     bart.preds = bart.mod$yhat.test.mean
     bart.mse = mean((bart.preds-truth)**2)
 
-    boost.cv.fit = cvboost(as.matrix(X), Y)
+    boost.cv.fit = rlearner::cvboost(as.matrix(X), Y)
     xgb.preds = predict(boost.cv.fit, as.matrix(X.test))
     xg.mse = mean((xgb.preds - truth)**2)
 
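
[Note on patch 6] Usage sketch for the cvboost calls adopted across this series, assuming
the rlearner package (github.com/xnie/rlearner; not on CRAN, so installed via
devtools::install_github("xnie/rlearner")). The call shape mirrors the patches' own usage;
treat the details as illustrative rather than a definitive account of the API:

library(rlearner)
X = matrix(runif(500 * 5), nrow = 500)
Y = 10 * sin(pi * X[, 1] * X[, 2]) + rnorm(500)
boost.cv.fit = rlearner::cvboost(as.matrix(X), Y)  # cross-validated xgboost regression fit
xgb.preds = predict(boost.cv.fit, as.matrix(X))    # predictions at (new) covariate values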