-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathMultilevelModelPost.r
135 lines (109 loc) · 5.83 KB
/
MultilevelModelPost.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
library(arm)
library(dplyr)
library(ggplot2)
library(magrittr)
library(lme4)
library(glmnet)
library(stringr)
library(tidyr)
source("multiplot.r")
source("readin.r")
# multiplot.r is a function found online that will plot multiple ggplots in a grid
# readin.r is my own helper function read in data from multiple folders where I store the CFB Stats data
options(dplyr.width = Inf)
### Read in the Data we Need ###
years <- 2014
plays <- readin("play", years)
# The NCAA treats sacks as runs (dumb) and I am choosing to remove those plays from this analysis
runs <- readin("rush", years) %>% filter(Sack == 0)
players <- readin("player", years)
teams <- readin("team", years)
### Augment Runner Info and get basic stats for runners ###
runs <- left_join(runs, select(plays, Game.Code, Play.Number, Offense.Team.Code, Defense.Team.Code, Down, Distance, Spot), by = c("Game.Code", "Play.Number"))
runner_info <- runs %>%
group_by(Player.Code) %>%
summarize(YPA = mean(Yards), Runs = n(), TDs = sum(Touchdown))
### Fit Mixed Effects Model for Yards ###
base_lme_model <- lmer(Yards ~ 1 + (1 | Offense.Team.Code) + (1 | Defense.Team.Code) + (1 | Player.Code), data = runs)
lme_values <- coef(base_lme_model)
lme_se <- se.ranef(base_lme_model)
# lme_values is a list that containes the estimates of the random effects coefficients for each different variable.
# lme_se is the same
### Add estimates of the player's coefficient back into the runner_info ###
runners <- lme_values$Player.Code
runners$Player.Code <- as.numeric(row.names(runners))
names(runners)[1] <- "LME_Value"
runners <- left_join(runners, select(players, Player.Code, Team.Code, Last.Name, First.Name), by = c("Player.Code")) %>%
left_join(teams, by = c("Team.Code")) %>%
left_join(runner_info, by = c("Player.Code"))
### Get estimates of Offense and Defense ###
offenses <- lme_values$Offense.Team.Code
offenses$Team.Code <- as.numeric(row.names(offenses))
names(offenses)[1] <- "LME_O_Value"
teams <- right_join(teams, offenses, by = c("Team.Code"))
defenses <- lme_values$Defense.Team.Code
defenses$Team.Code <- as.numeric(row.names(defenses))
names(defenses)[1] <- "LME_D_Value"
teams <- right_join(teams, defenses, by = c("Team.Code"))
runner_se <- data_frame(LME_SE = unname(lme_se$Player.Code[, 1]), Player.Code = as.numeric(row.names(lme_se$Player.Code)))
runners <- left_join(runners, runner_se, by = c("Player.Code"))
### Fit Ridge Regression Model ###
# The original ridge regression takes forever to run so I simply reduced the duplicate entries to a weighted regression with the weights as the number of times that observation occured. For example multiple times in a game a running back will run for 4 yards against the same defense, so we just count the number of times that happens.
weighted_runs <- runs %>% group_by(Yards, Offense.Team.Code, Defense.Team.Code, Player.Code) %>% tally
weighted_runs$Offense.Team.Code %<>% as.factor
weighted_runs$Defense.Team.Code %<>% as.factor
weighted_runs$Player.Code %<>% as.factor
# We can't use the base model.matrix formula because we want to include every dummy variable, we don't want a baseline built in to the intercept. So I had to use the contrasts.arg trick found on stack overflow.
model_data <- model.matrix(Yards ~ Offense.Team.Code + Defense.Team.Code + Player.Code, data = weighted_runs, contrasts.arg = lapply(weighted_runs[,sapply(weighted_runs,is.factor)], contrasts, contrasts=FALSE))
# The model takes a while so I just load in a saved model for future iterations
# base_rr_model <- glmnet(x = model_data, y = weighted_runs$Yards, weights = weighted_runs$n, alpha = 0)
load("base_rr_model.rdata")
ridge_values <- coef(base_rr_model,s = 8.932324)[,1] # this lambda was found through CV
# These coefficients contain all the estimates for all variables so we need to find the player ones we want
ridge_runners <- data_frame(Runners = names(ridge_values)[str_detect(names(ridge_values), "Player")], Ridge_Value = unname(ridge_values[str_detect(names(ridge_values), "Player")]))
ridge_runners$Player.Code <- as.numeric(str_extract(ridge_runners$Runners, "[0-9]+"))
runner_data <- left_join(runners, select(ridge_runners, -Runners), by = c("Player.Code"))
### Build Plots ###
value_distributions <- select(runner_data, Player.Code, LME_Value, YPA) %>%
gather(Measure, Value, -Player.Code) %>%
mutate(Measure = factor(Measure, labels = c("Mixed Effects Model Estimate", "Yards per Attempt"))) %>%
ggplot(aes(x = Value)) +
geom_histogram(binwidth = 2) +
facet_wrap(~Measure, ncol = 1) +
theme_538 +
theme(strip.text = element_text(size = 16)) +
ggtitle("Distribution of Runner Estimates") +
xlim(-25,25)
num_ypa <- ggplot(runner_data, aes(x = Runs, y = YPA)) +
geom_point(alpha = .6) +
xlab("Number of Rushes") +
ylab("Yards per Attempt") +
ggtitle("Effect of number of runs on YPA") +
theme_538
lmm_ypa <- ggplot(runner_data, aes(LME_Value, YPA)) +
geom_point(aes(size = Runs), alpha = .6) +
xlab("Mixed Effects Model Coefficient Estimates") +
ylab("Yards per Attempt") +
ggtitle("Mixed Effects Model Estimates vs Yards per Attmept") +
theme_538 +
theme(legend.position = c(.75, .25))
off_ypa <- runs %>%
group_by(Offense.Team.Code) %>%
summarize(YPA = mean(Yards)) %>%
left_join(teams, by = c("Offense.Team.Code" = "Team.Code")) %>%
ggplot(aes(x = LME_O_Value, y = YPA)) +
geom_point() +
xlab("Multilevel Model Coefficient Estimate") +
ylab("Yards per Carry") +
ggtitle("Multilevel Model Estimate vs YPC\n For Offenses") +
theme_538
def_ypa <- runs %>%
group_by(Defense.Team.Code) %>%
summarize(YPA = mean(Yards)) %>%
left_join(teams, by = c("Defense.Team.Code" = "Team.Code")) %>%
ggplot(aes(x = LME_D_Value, y = YPA)) +
geom_point() +
xlab("Multilevel Model Coefficient Estimate") +
ylab("Yards per Carry") +
ggtitle("Multilevel Model Estimate vs YPC\n For Defenses") +
theme_538