Bachelor's thesis - data analysis.Rmd

---
title: "UCF Bachelor's Thesis"
subtitle: "Analysis of Coded Data"
output: html_notebook
---

Installing `rtemis`, a library for advanced machine learning which is used here for the additive trees, as well as the necessary dependencies.

```{r eval=FALSE}
# One-time setup (chunk is not evaluated when knitting).
# rtemis is not installed from CRAN here, so it is fetched from GitHub via remotes.
install.packages("remotes")
remotes::install_github("egenn/rtemis")

# Additional dependencies
install.packages(
  c(
    "e1071", "gbm", "glmnet", "pbapply", "plyr", "ranger", "rpart",
    "data.tree", "DiagrammeR", "missRanger", "plotly", "DiagrammeRsvg",
    "rsvg", "svglite", "extrafont"
  )
)
```


```{r message=FALSE, warning=FALSE}
library(tidyverse)
library(rtemis)
library(DiagrammeR)
# Loaded separately to export the plots
library(DiagrammeRsvg)
library(rsvg)
```

To use Calibri:

```{r message=FALSE, warning=FALSE}
library(extrafont)
# Import the Calibri font files into extrafont's font table. The file-name
# pattern is somewhat system-specific.
font_import(pattern = "calibri", prompt = FALSE)
# The "win" device is only available on Windows; registering it on any other
# platform would abort the notebook, so the call is skipped there.
if (.Platform$OS.type == "windows") {
  loadfonts(device = "win")
}
```

# Pilot

Importing the Atlas.ti dataset. (NB: I changed the document number for the second holding of Finogenov to make distinguishing easier)

```{r}
library(readxl)
# Raw pilot codings, read from the Excel workbook in the working directory.
pilot <- read_excel(path = "pilot coding data.xlsx")
```

Merging the observations spread out over multiple rows into one.

```{r}
# Collapse the pilot data to one row per value of D: within each group, every
# kept column is reduced to a comma-separated string of its distinct,
# non-missing values.
pilot2 <- summarise_all(
  group_by(select(pilot, D, 3, 13:70), D),
  function(column) toString(unique(na.omit(column)))
)
```

Getting summary statistics.

```{r}
# Row count of the merged pilot data, reported once per column.
summarise_all(pilot2, ~ n())

# Long format for columns Q49 to Q53: one row per D / variable combination.
pilot_subset <- pilot2 %>%
  select(D, Q49:Q53) %>%
  pivot_longer(cols = -D, names_to = "var", values_to = "val")

# Frequency of every value within each variable (the count ends up in column D).
summarise_all(group_by(pilot_subset, var, val), ~ length(.))
```

---

# Tidying and Preparing the Final Datasets

Importing the Excel coded cases dataset:

```{r}
library(readxl)
# Raw coded cases, read from the Excel workbook in the working directory.
cases_raw <- read_excel(path = "case codings database.xlsx")
```

Removing the explanation for each level, turning ordinal variables into ordinal factors, and the remaining ones into normal factors:

```{r}
cases <- cases_raw %>%
  select(gc_dec:l_train) %>%
  # Drop the " = <explanation>" suffix so only the code itself remains.
  mutate_all(~ str_replace(., "\\s=\\s.*", "")) %>%
  # Ordinal variables: parse the code as a number, then make it an ordered factor.
  mutate_at(
    vars(
      scrutiny, f_prev, f_dispute, uof_deaths, t_who,
      t_nature, uof_sa, uof_nature, o_control
    ),
    ~ as.ordered(as.numeric(.))
  ) %>%
  # Remaining columns become plain factors; as_factor() is applied to the
  # ordered factors created above as well.
  mutate_all(~ as_factor(.))

# Put the identifying columns (ID to respondent) back in front of the coded variables.
cases <- cbind(cases_raw %>% select(ID:respondent), cases)
```


Create the datasets for each of our four outcome variables. This involves putting the outcome variable as last column and recoding the factor such that the "positive" outcome (i.e. a violation) is the first level of the factor. Furthermore, the three other outcome variables are turned into binomial variables with one positive (violation) level and one negative (no violation, not related or uncertain) level. 

```{r}
# One modelling dataset per outcome: predictors scrutiny to l_train followed by
# the outcome as last column, with the violation level (1) as first factor level.

# h_viol is already coded 1/0, so only the level order is set.
cases.viol <- cases %>%
  select(scrutiny:l_train, h_viol) %>%
  mutate(h_viol = factor(h_viol, levels = c(1, 0)))

# For the three remaining outcomes, code 2 becomes 1; codes 0, 1 and -88 become 0.
cases.force <- cases %>%
  select(scrutiny:l_train, h_force) %>%
  mutate(
    h_force = factor(
      recode(h_force, `0` = 0, `1` = 0, `-88` = 0, `2` = 1),
      levels = c(1, 0)
    )
  )

cases.op <- cases %>%
  select(scrutiny:l_train, h_operation) %>%
  mutate(
    h_operation = factor(
      recode(h_operation, `0` = 0, `1` = 0, `-88` = 0, `2` = 1),
      levels = c(1, 0)
    )
  )

cases.law <- cases %>%
  select(scrutiny:l_train, h_law) %>%
  mutate(
    h_law = factor(
      recode(h_law, `0` = 0, `1` = 0, `-88` = 0, `2` = 1),
      levels = c(1, 0)
    )
  )
```

# Summary Statistics

Getting a graphic overview of the outcome variables. First, we need to transform the dataset into a long format with the old values and recodes next to each other.

```{r}
# Original codings of the four outcomes in long format (one row per case and
# obligation). h_viol is relabelled from 1/0 to "2"/"0" so that it uses the
# same codes as the other three outcomes.
casesDV.norecode <- data.frame(
  h_viol = recode(cases$h_viol, `1` = "2", `0` = "0"),
  h_force = cases$h_force,
  h_operation = cases$h_operation,
  h_law = cases$h_law
) %>% 
  mutate(ID = row_number()) %>% 
  pivot_longer(cols = h_viol:h_law, names_to = "obligation", values_to = "decision")

# Binary recodes used by the decision trees, in the same long layout.
casesDV.recoded <- data.frame(
  h_viol = cases.viol$h_viol,
  h_force = cases.force$h_force,
  h_operation = cases.op$h_operation,
  h_law = cases.law$h_law
) %>% 
  mutate(ID = row_number()) %>% 
  pivot_longer(cols = h_viol:h_law, names_to = "obligation", values_to = "recoded_dec")

# Put original and recoded decision side by side. The join keys are spelled
# out so the join does not depend on whichever columns happen to share a name
# (and no "Joining, by = ..." message is printed).
casesDV <- left_join(casesDV.norecode, casesDV.recoded,
                     by = c("ID", "obligation")) %>% 
  mutate(decision = factor(decision, levels = c("-88", "0", "1", "2"), ordered = TRUE),
         recoded_dec = factor(recoded_dec, levels = c("0", "1"), ordered = TRUE))
```

```{r}

# Figure 1: horizontal stacked bars of the original codings (grey fill) with
# the binary recodes drawn on top as coloured outlines.
plot.summary.violations <- ggplot(
  # Fix the order of the obligations on the y axis.
  data = mutate(
    casesDV,
    obligation = factor(obligation, levels = c("h_law", "h_force", "h_operation", "h_viol"))
  )
) +
  geom_bar(mapping = aes(y = obligation, fill = decision)) +
  # Fully transparent bars: only their outline (the recoded decision) is visible.
  geom_bar(mapping = aes(y = obligation, color = recoded_dec), size = 1.3, alpha = 0) +
  labs(
    title = "Figure 1",
    subtitle = "Distribution of the Values of the Outcome Variables",
    x = "Number of cases",
    y = NULL,
    fill = "Coding scheme",
    color = "Decision tree"
  ) +
  scale_y_discrete(
    labels = c(
      "Framework\n obligation", "Use of force\n obligation",
      "Operation\n obligation", "Article 2\n decision"
    )
  ) +
  # Alternatives tried: scale_fill_viridis_d(option = "magma"),
  # scale_fill_brewer(palette = "Purples")
  scale_fill_grey(
    labels = c("Uncertain (-88)", "Unrelated (0)", "No violation (1)", "Violation (2)")
  ) +
  scale_color_manual(
    values = c("#18A3ACDD", "#F48024DD"),
    labels = c("No violation", "Violation")
  ) +
  theme_light(base_family = "Calibri", base_size = 12) +
  # Making it APA compatible
  theme(
    plot.title = element_text(size = 12, face = "bold", margin = margin(b = 12)),
    plot.subtitle = element_text(face = "italic", margin = margin(b = 12)),
    axis.title.x = element_text(margin = margin(t = 12)),
    axis.title.y = element_text(margin = margin(r = 12))
  )

ggsave(
  filename = "figures/figure summary-violations-obligations.png",
  plot = plot.summary.violations,
  width = 160, height = 120, units = "mm"
)
```


# Using Decision Trees

## Training the Models

Perform a 10-fold nested cross-validation for hyperparameter tuning (model selection) and model assessment. A 10-fold cross-validation is performed to test model generalizability. Within each resample, another 10-fold cross-validation is performed to tune the gamma hyperparameter between five different options. In total, 2000 models are thus run (4 outcome variables × 10 outer folds × 10 inner folds × 5 gamma values). 

```{r eval=FALSE}
# Nested cross-validation per outcome: 10 outer k-fold resamples, and an inner
# 10-fold resampling (grid.resample.rtset) over the gamma grid. Every fitted
# object is written to disk right away because the runs are expensive.

# h_viol
cases.viol.tree <- elevate(
  x = cases.viol,
  mod = "addtree",
  resampler = "kfold",
  n.resamples = 10,
  grid.resample.rtset = rtset.resample("kfold", 10),
  gamma = c(0.1, 0.3, 0.5, 0.7, 0.9),
  learning.rate = 0.001,
  seed = 2020
)
saveRDS(cases.viol.tree, file = "models/cases-viol-tree.rds")

# h_force
cases.force.tree <- elevate(
  x = cases.force,
  mod = "addtree",
  resampler = "kfold",
  n.resamples = 10,
  grid.resample.rtset = rtset.resample("kfold", 10),
  gamma = c(0.1, 0.3, 0.5, 0.7, 0.9),
  learning.rate = 0.001,
  seed = 2020
)
saveRDS(cases.force.tree, file = "models/cases-force-tree.rds")

# h_operation
cases.op.tree <- elevate(
  x = cases.op,
  mod = "addtree",
  resampler = "kfold",
  n.resamples = 10,
  grid.resample.rtset = rtset.resample("kfold", 10),
  gamma = c(0.1, 0.3, 0.5, 0.7, 0.9),
  learning.rate = 0.001,
  seed = 2020
)
saveRDS(cases.op.tree, file = "models/cases-op-tree.rds")

# h_law
cases.law.tree <- elevate(
  x = cases.law,
  mod = "addtree",
  resampler = "kfold",
  n.resamples = 10,
  grid.resample.rtset = rtset.resample("kfold", 10),
  gamma = c(0.1, 0.3, 0.5, 0.7, 0.9),
  learning.rate = 0.001,
  seed = 2020
)
saveRDS(cases.law.tree, file = "models/cases-law-tree.rds")

```

Read the AddTrees in from the saved file because re-executing the model run for knitting is too computationally expensive:

```{r}
# Restore the cross-validated models from the files in models/.
cases.viol.tree  <- readRDS(file = "models/cases-viol-tree.rds")
cases.force.tree <- readRDS(file = "models/cases-force-tree.rds")
cases.op.tree    <- readRDS(file = "models/cases-op-tree.rds")
cases.law.tree   <- readRDS(file = "models/cases-law-tree.rds")
```

Getting some visual summaries for the model assessment: 

```{r}
# Call each cross-validated model's own plot method, passing a PDF file name
# under graphs/ as the output target.
cases.viol.tree$plot(filename = "graphs/cases-viol-tree-plot.pdf")
cases.force.tree$plot(filename = "graphs/cases-force-tree-plot.pdf")
cases.op.tree$plot(filename = "graphs/cases-op-tree-plot.pdf")
cases.law.tree$plot(filename = "graphs/cases-law-tree-plot.pdf")
```

```{r}
# Print each model's own description, preceded by a header naming the outcome
# so the four outputs can be told apart.
message("h_viol model:")
cases.viol.tree$describe()
message("\n h_force model:")
cases.force.tree$describe()
message("\n h_operation model:")
cases.op.tree$describe()
message("\n h_law model:")
cases.law.tree$describe()
```

## Model Assessment and Results of Model Selection

After running the models, we would now like to extract the respective information. This encompasses the values in the list below, which includes the aggregated values of the evaluation measures and the hyperparameters picked.

* Balanced accuracy
* Sensitivity (recall)
* Specificity
* Precision (positive predictive value)
* Negative predictive value
* Gamma ($\gamma = \frac{\lambda}{1 + \lambda}$)

### Calculating the Balanced Accuracy Confidence Interval

To calculate the **confidence interval (CI) for sensitivity and specificity**, I am simply using `binom.test`. Because sensitivity and specificity can be seen as a success rate of a number of independent trials, they can be modeled by the binomial distribution and, therefore, we may take any method used to calculate the binomial proportion confidence interval here, including the Clopper-Pearson method used in `binom.test`.

To calculate the **confidence interval of balanced accuracy (BACC)**, we may approximate it using a normal confidence interval with a logit transformation because it is an aggregate proportion value. The general formula for a confidence interval is $(\mathrm{CI_{lower}}, \mathrm{CI_{upper}}) = \bar X \pm z * \frac{\sigma}{\sqrt{n}}$ with $\bar X$ being the mean (or other value we want to calculate the CI from), $z$ being the critical value of the distribution's quantile function for a certain probability (e.g. $0.975$ (one-sided) for a 95% CI (two-sided)), $\sigma$ being the standard deviation, and $n$ the sample size.

The variances of sensitivity and specificity, as if they would be normally distributed, are first extracted from their respective CIs and then used to calculate the BACC variance following the approach to calculating the variance of an average: $\mathrm{Var}(\bar X) = \left(\frac{1}{n} \right) ^2 \left( \mathrm{Var}(X_1) + \mathrm{Var}(X_2) + \dots + \mathrm{Var}(X_n) \right)$. With logit defined as $\mathrm{logit}(p) = \mathrm{ln} \left( \frac{p}{1-p} \right)$, we can [approximate](https://stats.stackexchange.com/a/119770/206695) it using the normal distribution: 

$$
\ln\frac{p}{1-p} \sim_{\text{approx.}} N\left(\ln\frac{p}{1-p}, \frac 1{np(1-p)}\right)
$$

Here, $N$ is the normal distribution and the first value is the mean and the second value is the variance.

Henceforth, the variance is calculated with
$$
\mathrm{Var(\bar X)} = \left(\frac{1}{np(1-p)} \right) ^2 \left( \mathrm{Var}(X_1) + \mathrm{Var}(X_2) + \dots + \mathrm{Var}(X_n) \right)
$$

Applied to our case, the formula for the variance is

$$
\mathrm{Var(bacc)} = \left(\frac{1}{np(1-p)} \right) ^2 \left( \mathrm{Var}(\mathrm{sens}) + \mathrm{Var}(\mathrm{spec}) \right)
$$

Thus, the logit-transformed distribution of BACC is 
$$
\mathrm{logit}(\widehat{\mathrm{bacc}})\sim N\left(\mathrm{logit}(\mathrm{bacc}), \frac{\sigma_{\mathrm{sens}}^2+\sigma_{\mathrm{spec}}^2}{4\mathrm{bacc}^2(1-\mathrm{bacc})^2}\right)
$$

Henceforth, the **formula for the BACC CI** is the following:

$$
(\mathrm{CI_{lower}}, \mathrm{CI_{upper}}) = \mathrm{logit}^{-1} \left( \mathrm{logit}(\mathrm{bacc}) \pm  z * \frac{\mathrm{logit}( \sigma_{\mathrm{bacc}} )}{\sqrt n} \right)
$$

, whereby the following applies:

$$
\mathrm{logit}(\sigma_{\mathrm{bacc}}) = \sqrt{ \frac{ \sigma_{\mathrm{sens}}^2 + \sigma_{\mathrm{spec}}^2 }{ 4 \mathrm{bacc}^2 (1 - \mathrm{bacc})^2 } }
$$

$$
\sigma_{\mathrm{sens}}^2 = \left(  \frac{ \mathrm{CI_{sens_{lower}}} - \mathrm{CI_{sens_{upper}}} }{ 2 * 1.96 } \right)^2
$$

$$
\sigma_{\mathrm{spec}}^2 = \left(  \frac{ \mathrm{CI_{spec_{lower}}} - \mathrm{CI_{spec_{upper}}} }{ 2 * 1.96 } \right)^2
$$

with $\mathrm{logit}^{-1}$ being the inverse logit function (also known as expit), $\sigma_{\mathrm{sens}}^2$ and $\sigma_{\mathrm{spec}}^2$ being the variance for sensitivity and specificity respectively, and $1.96$ being the critical $z$-value for a 95% confidence interval.

### Extracting and Calculating the Metrics

For each model, get the measures as well as their confidence intervals: 

```{r}
# Test-set error measures of the h_viol model.
cases.viol.tree$error.test.repeats

# 95% CIs for sensitivity and specificity (binom.test) and, from these, a
# logit-based CI for the balanced accuracy. Row 1 holds the lower bounds,
# row 2 the upper bounds.
data.frame(
  # Counts typed in by hand (couldn't find a way to extract the values)
  "sensitivity_ci" = binom.test(53, 61)$conf.int,
  "specificity_ci" = binom.test(13, 16)$conf.int
) %>%
  mutate(
    lower_ci = row_number(sensitivity_ci) %% 2 == 1,
    bacc = cases.viol.tree$error.test.repeats$`Balanced Accuracy`,
    # Variances recovered from the CI widths as if they were normal intervals.
    var_sens = ((first(sensitivity_ci) - last(sensitivity_ci)) / (2 * 1.96))^2,
    var_spec = ((first(specificity_ci) - last(specificity_ci)) / (2 * 1.96))^2,
    # Standard deviation on the logit scale (see the formulas above).
    sd_logit = sqrt((var_sens + var_spec) / (4 * bacc^2 * (1 - bacc)^2)),
    # 77 equals the sum of the two denominators above (61 + 16).
    # NOTE(review): var_sens / var_spec come from CI widths and therefore
    # already look like standard errors; dividing by sqrt(77) again may make
    # the interval too narrow -- confirm against the intended formula.
    bacc_err = 1.96 * sd_logit / sqrt(77),
    bacc_ci = invlogit(logit(bacc) + ifelse(lower_ci, -bacc_err, bacc_err))
  ) %>%
  select(sensitivity_ci, specificity_ci, lower_ci, bacc_ci)
```

```{r}
# Test-set error measures of the h_force model.
cases.force.tree$error.test.repeats

# 95% CIs for sensitivity and specificity (binom.test) and, from these, a
# logit-based CI for the balanced accuracy. Row 1 holds the lower bounds,
# row 2 the upper bounds.
data.frame(
  # Counts typed in by hand (couldn't find a way to extract the values)
  "sensitivity_ci" = binom.test(28, 38)$conf.int,
  "specificity_ci" = binom.test(27, 39)$conf.int
) %>%
  mutate(
    lower_ci = row_number(sensitivity_ci) %% 2 == 1,
    bacc = cases.force.tree$error.test.repeats$`Balanced Accuracy`,
    # Variances recovered from the CI widths as if they were normal intervals.
    var_sens = ((first(sensitivity_ci) - last(sensitivity_ci)) / (2 * 1.96))^2,
    var_spec = ((first(specificity_ci) - last(specificity_ci)) / (2 * 1.96))^2,
    # Standard deviation on the logit scale (see the formulas above).
    sd_logit = sqrt((var_sens + var_spec) / (4 * bacc^2 * (1 - bacc)^2)),
    # 77 equals the sum of the two denominators above (38 + 39).
    # NOTE(review): var_sens / var_spec come from CI widths and therefore
    # already look like standard errors; dividing by sqrt(77) again may make
    # the interval too narrow -- confirm against the intended formula.
    bacc_err = 1.96 * sd_logit / sqrt(77),
    bacc_ci = invlogit(logit(bacc) + ifelse(lower_ci, -bacc_err, bacc_err))
  ) %>%
  select(sensitivity_ci, specificity_ci, lower_ci, bacc_ci)
```

```{r}           
# Test-set error measures of the h_operation model.
cases.op.tree$error.test.repeats

# 95% CIs for sensitivity and specificity (binom.test) and, from these, a
# logit-based CI for the balanced accuracy. Row 1 holds the lower bounds,
# row 2 the upper bounds.
data.frame(
  # Counts typed in by hand (couldn't find a way to extract the values)
  "sensitivity_ci" = binom.test(34, 43)$conf.int,
  "specificity_ci" = binom.test(29, 34)$conf.int
) %>%
  mutate(
    lower_ci = row_number(sensitivity_ci) %% 2 == 1,
    bacc = cases.op.tree$error.test.repeats$`Balanced Accuracy`,
    # Variances recovered from the CI widths as if they were normal intervals.
    var_sens = ((first(sensitivity_ci) - last(sensitivity_ci)) / (2 * 1.96))^2,
    var_spec = ((first(specificity_ci) - last(specificity_ci)) / (2 * 1.96))^2,
    # Standard deviation on the logit scale (see the formulas above).
    sd_logit = sqrt((var_sens + var_spec) / (4 * bacc^2 * (1 - bacc)^2)),
    # 77 equals the sum of the two denominators above (43 + 34).
    # NOTE(review): var_sens / var_spec come from CI widths and therefore
    # already look like standard errors; dividing by sqrt(77) again may make
    # the interval too narrow -- confirm against the intended formula.
    bacc_err = 1.96 * sd_logit / sqrt(77),
    bacc_ci = invlogit(logit(bacc) + ifelse(lower_ci, -bacc_err, bacc_err))
  ) %>%
  select(sensitivity_ci, specificity_ci, lower_ci, bacc_ci)
```

```{r}
# Test-set error measures of the h_law model.
cases.law.tree$error.test.repeats

# 95% CIs for sensitivity and specificity (binom.test) and, from these, a
# logit-based CI for the balanced accuracy. Row 1 holds the lower bounds,
# row 2 the upper bounds.
data.frame(
  # Counts typed in by hand (couldn't find a way to extract the values)
  "sensitivity_ci" = binom.test(11, 14)$conf.int,
  "specificity_ci" = binom.test(59, 63)$conf.int
) %>%
  mutate(
    lower_ci = row_number(sensitivity_ci) %% 2 == 1,
    bacc = cases.law.tree$error.test.repeats$`Balanced Accuracy`,
    # Variances recovered from the CI widths as if they were normal intervals.
    var_sens = ((first(sensitivity_ci) - last(sensitivity_ci)) / (2 * 1.96))^2,
    var_spec = ((first(specificity_ci) - last(specificity_ci)) / (2 * 1.96))^2,
    # Standard deviation on the logit scale (see the formulas above).
    sd_logit = sqrt((var_sens + var_spec) / (4 * bacc^2 * (1 - bacc)^2)),
    # 77 equals the sum of the two denominators above (14 + 63).
    # NOTE(review): var_sens / var_spec come from CI widths and therefore
    # already look like standard errors; dividing by sqrt(77) again may make
    # the interval too narrow -- confirm against the intended formula.
    bacc_err = 1.96 * sd_logit / sqrt(77),
    bacc_ci = invlogit(logit(bacc) + ifelse(lower_ci, -bacc_err, bacc_err))
  ) %>%
  select(sensitivity_ci, specificity_ci, lower_ci, bacc_ci)
```

Side note: The difference between `tree$error.test.res.aggr` and `tree$error.test.res.mean`: The latter just takes the mean of the results of the 10 resamples, while the former aggregates the specific counts into a confusion matrix and then calculates the evaluation measures. The (theoretical) difference between `tree$error.test.repeats` and `tree$error.test.res.aggr`: The former relates to the repeats (default: `n.repeats` = 1) of the resampling process (summarizing those values with `... .mean` and `... .sd`), while `error.test.res` concerns the resamples in each repeat, with `aggr` listing the aggregate values within each repeat (which is why they have the `$elevate.ADDTREE.repeat1` substructure).

Extracting the best hyperparameters:

```{r}
# Frequency table of the gamma values stored in best.tune, one table per model.
table(select(cases.viol.tree$parameters$best.tune$elevate.ADDTREE.repeat1, gamma))
table(select(cases.force.tree$parameters$best.tune$elevate.ADDTREE.repeat1, gamma))
table(select(cases.op.tree$parameters$best.tune$elevate.ADDTREE.repeat1, gamma))
table(select(cases.law.tree$parameters$best.tune$elevate.ADDTREE.repeat1, gamma))

# Mean and median of those gamma values, one summary per model.
summarize(cases.viol.tree$parameters$best.tune$elevate.ADDTREE.repeat1, mean(gamma), median(gamma))
summarize(cases.force.tree$parameters$best.tune$elevate.ADDTREE.repeat1, mean(gamma), median(gamma))
summarize(cases.op.tree$parameters$best.tune$elevate.ADDTREE.repeat1, mean(gamma), median(gamma))
summarize(cases.law.tree$parameters$best.tune$elevate.ADDTREE.repeat1, mean(gamma), median(gamma))
```

Based on the median hyperparameter, I pick $\gamma = 0.4$ for every model but `h_operation` for which I choose $\gamma = 0.7$ for the full model.

## Full Models for Prediction and Visualization

Full model for `h_viol`:

```{r}
# Fit the h_viol tree on the complete dataset and keep a copy on disk.
cases.viol.tree.full <- s.ADDTREE(cases.viol, gamma = 0.4, learning.rate = 0.001, seed = 2020)
saveRDS(cases.viol.tree.full, file = "models/cases-viol-tree-full.rds")

# Draw the tree and write the drawing to an SVG file.
cases.viol.tree.full.viz <- dplot3.addtree(cases.viol.tree.full, fontname = "calibri")
rsvg_svg(
  charToRaw(export_svg(cases.viol.tree.full.viz)),
  file = "graphs/cases-viol-tree-generated.svg"
)
```

Full model for `h_force`:

```{r}
# Fit the h_force tree on the complete dataset and keep a copy on disk.
cases.force.tree.full <- s.ADDTREE(cases.force, gamma = 0.4, learning.rate = 0.001, seed = 2020)
saveRDS(cases.force.tree.full, file = "models/cases-force-tree-full.rds")

# Draw the tree and write the drawing to an SVG file.
cases.force.tree.full.viz <- dplot3.addtree(cases.force.tree.full, fontname = "calibri")
rsvg_svg(
  charToRaw(export_svg(cases.force.tree.full.viz)),
  file = "graphs/cases-force-tree-generated.svg"
)
```

Full model for `h_operation`:

```{r}
# Fit the h_operation tree on the complete dataset and keep a copy on disk.
# gamma is 0.7 here, unlike the other three full models: the notebook text
# picks 0.7 for h_operation based on the median tuned value (the code
# previously used 0.4, contradicting that choice).
cases.op.tree.full <- s.ADDTREE(cases.op, gamma = 0.7, learning.rate = 0.001, seed = 2020)
saveRDS(cases.op.tree.full, "models/cases-op-tree-full.rds")

# Draw the tree and write the drawing to an SVG file.
cases.op.tree.full.viz <- dplot3.addtree(cases.op.tree.full, fontname = "calibri")
cases.op.tree.full.viz %>% 
  export_svg() %>% charToRaw() %>% 
  rsvg_svg(file = "graphs/cases-op-tree-generated.svg")
```

Full model for `h_law`:

```{r}
# Fit the h_law tree on the complete dataset and keep a copy on disk.
cases.law.tree.full <- s.ADDTREE(cases.law, gamma = 0.4, learning.rate = 0.001, seed = 2020)
saveRDS(cases.law.tree.full, file = "models/cases-law-tree-full.rds")

# Draw the tree and write the drawing to an SVG file.
cases.law.tree.full.viz <- dplot3.addtree(cases.law.tree.full, fontname = "calibri")
rsvg_svg(
  charToRaw(export_svg(cases.law.tree.full.viz)),
  file = "graphs/cases-law-tree-generated.svg"
)
```

### Manually reconstructing trees for higher legibility

The DiagrammeR library is used, and the original layout and style are recreated, but with better labels.

```{r}
# Render the hand-written Graphviz file for h_viol and save it as SVG.
rsvg_svg(
  charToRaw(export_svg(grViz("graphs/cases-viol-tree-graph.gv"))),
  file = "graphs/cases-viol-tree-graph.svg"
)
```

```{r}
# Render the hand-written Graphviz file for h_force and save it as SVG.
rsvg_svg(
  charToRaw(export_svg(grViz("graphs/cases-force-tree-graph.gv"))),
  file = "graphs/cases-force-tree-graph.svg"
)
```

```{r}
# Render the hand-written Graphviz file for h_operation and save it as SVG.
rsvg_svg(
  charToRaw(export_svg(grViz("graphs/cases-op-tree-graph.gv"))),
  file = "graphs/cases-op-tree-graph.svg"
)
```

```{r}
# Render the hand-written Graphviz file for h_law and save it as SVG.
rsvg_svg(
  charToRaw(export_svg(grViz("graphs/cases-law-tree-graph.gv"))),
  file = "graphs/cases-law-tree-graph.svg"
)
```

# Experiments

```{r}
#Repeat with all of them?
cases_test.tree <- cbind(cases$respondent, cases$year, cases$gc_dec, cases.viol) %>% s.ADDTREE(gamma = 0.4, learning.rate = 0.001, seed = 2020)
dplot3.addtree(cases_test.tree, fontname = "calibri")
```


```{r}
# Experiment: plain 10-fold cross-validation with a single fixed gamma
# (no tuning grid), using 4 cores.
cases_test <- elevate(
  x = cases.viol,
  mod = "addtree",
  resampler = "kfold",
  n.resamples = 10,
  gamma = 0.3,
  learning.rate = 0.001,
  n.cores = 4,
  seed = 2020
)
```

```{r}
# Experiment: 2-fold cross-validation repeated twice with a single fixed gamma,
# using 4 cores.
cases_test <- elevate(
  x = cases.viol,
  mod = "addtree",
  resampler = "kfold",
  n.resamples = 2,
  n.repeats = 2,
  gamma = 0.3,
  learning.rate = 0.001,
  n.cores = 4,
  seed = 2020
)

```


```{r}
# Experiment on the exported CSV: nested 2-fold cross-validation with
# upsampling, trying two gamma values.
cases_test.tree <- read_csv("cases_test.csv") %>% 
  preprocess(numeric2factor = TRUE) %>% 
  elevate(mod = "addtree", resampler = "kfold", n.resamples = 2, upsample = TRUE,
          grid.resample.rtset = rtset.resample("kfold", 2),
          gamma = c(0.5, 1))

# Single tree on the same data. It is stored under its own name: previously it
# overwrote the cross-validated object above, and the plotting calls below
# referred to a `cases_test2.tree` that was never created.
cases_test2.tree <- read_csv("cases_test.csv") %>% 
  preprocess(numeric2factor = TRUE) %>% 
  s.ADDTREE(outdir = "test")

# One tree taken from inside the cross-validated object ...
dplot3.addtree(cases_test.tree$mod$elevate.ADDTREE.repeat1$ADDTREE2$mod1)
# ... and the single tree: variable importance, tree drawing and pruned tree.
cases_test2.tree$plotVarImp()
dplot3.addtree(cases_test2.tree)
plot(cases_test2.tree$mod$addtree.pruned)
```


```{r}
# Experiment: h_viol dataset without reordering the outcome levels.
cases_test <- cases %>% select(scrutiny:l_train, h_viol)
# Earlier variant kept for reference: start from the raw data and rewrite the
# negative special codes (-99, -88, -77) as positive ones.
# cases_test <- cases_raw %>%
#   select(scrutiny:l_train, h_viol) %>%
#   mutate_all(list(~ str_replace(., "\\s=\\s.*", ""))) %>%
#   mutate_all(list(~ dplyr::recode(.,`-99` = "99",
#                                   `-88` = "88",
#                                    `-77` = "77"
#                                   )))
# Inspect the data before and after preprocessing.
checkData(cases_test)
cases_test2 <- preprocess(cases_test, character2factor = TRUE)
# write_csv(cases_test2, "cases_test.csv")

# Earlier variant kept for reference: recode/impute o_evac and o_distinct.
# cases_test <- cases
# cases_test$violation <- cases$h_viol
# cases_test2 <- cases_test %>% 
#   mutate(o_evac = recode(o_evac, "0" = "1"), o_distinct = recode(o_distinct, "0" = "1")) %>% 
#   mutate(o_evac = replace_na(o_evac, 0), o_distinct = replace_na(o_distinct, 0)) %>% 
#   preprocess(impute = TRUE, character2factor = TRUE, impute.type = "missForest")


checkData(cases_test2)

# Single tree on the unprocessed cases_test (not cases_test2), with upsampling.
cases_test.tree <- s.ADDTREE(cases_test, gamma = 0.5, learning.rate = 0.0001, upsample = TRUE)
dplot3.addtree(cases_test.tree)
print(cases_test.tree$mod$addtree.pruned, "Estimate", "N")

# With testing: split via 10-fold resampling, using the indices in Fold_2 as
# training rows and all remaining rows as test rows.
res <- resample(cases_test, n.resamples = 10, resampler = "kfold")
cases_test.train <- cases_test[res$Fold_2, ]
cases_test.test <- cases_test[-res$Fold_2, ]
# NOTE(review): gamma = 2 lies outside the range implied by
# gamma = lambda / (1 + lambda) stated earlier in this notebook -- confirm
# that this value is intended.
cases_test.tree <- s.ADDTREE(x = cases_test.train, x.test = cases_test.test, gamma = 2, learning.rate = 0.01)
dplot3.addtree(cases_test.tree)
```