project.Rmd

Reading in the data to a data.frame object.
```{r}
# Load the survey CSV into `data`; the first row of the file holds the column
# names.
data <- read.csv(
  file = "OSMI Mental Health in Tech Survey 2017-18.csv",
  header = TRUE
)
```

Replacing the date-like values ("5-Jan", "25-Jun") in the 'How.many.employees...' column with the proper ranges ("1-5", "6-25").
```{r}
# The company-size answers "1-5" and "6-25" show up in the data as the
# date-like strings "5-Jan" and "25-Jun" (presumably spreadsheet
# auto-formatting -- confirm against the raw export); map them back.
#
# The recode is done on character values and the factor is rebuilt afterwards,
# so the obsolete "5-Jan"/"25-Jun" levels do not linger in the column. When the
# column was read as plain character (the read.csv default since R 4.0) it
# stays character and no stray `levels` attribute is attached to it.
size_col <- "How.many.employees.does.your.company.or.organization.have."
company_size <- data[[size_col]]
size_was_factor <- is.factor(company_size)
company_size <- as.character(company_size)
# %in% treats NA as "no match", so missing answers are left untouched.
company_size[company_size %in% "5-Jan"] <- "1-5"
company_size[company_size %in% "25-Jun"] <- "6-25"
data[[size_col]] <- if (size_was_factor) factor(company_size) else company_size
```

For all of the columns asking if they have a certain mental health issue, replaces the value with a '1' if they do and a '0' if they don't.
```{r}
# Columns 64 -> 89 are the per-disorder answer columns. Recode every cell to 1
# when the respondent filled it in and to 0 when it is NA or blank.
# Each column is recoded as a whole instead of cell by cell.
for (col_idx in 64:89) {
  answers <- data[[col_idx]]
  if (is.factor(answers)) {
    # A factor only accepts values that are among its levels, so register the
    # two new codes first.
    levels(answers) <- c(levels(answers), 0, 1)
  }
  # is.na() is evaluated first, so TRUE | NA resolves to TRUE and the mask
  # itself never contains NA.
  is_blank <- is.na(answers) | answers == ""
  # `answers[] <-` replaces all values in place: a factor stays a factor and a
  # character column stays character.
  answers[] <- ifelse(is_blank, 0, 1)
  data[[col_idx]] <- answers
}
```

Combines the results of the third set of questions about mental health issues into the second set, then removes the third set.
```{r}
# Fold the third set of disorder flags (columns 77 -> 89) into the second set,
# which sits 13 columns to the left (64 -> 76), then drop the third set.
# The loop runs from the highest index down so that removing a column never
# shifts the position of a column that still has to be processed.
for (src_idx in 89:77) {
  # which() ignores NA comparisons, so an NA cell is simply "not flagged".
  flagged <- which(data[[src_idx]] == 1)
  # Store the literal 1 instead of copying the source cell: copying a factor
  # cell into a non-factor column would store the factor's internal integer
  # code rather than the value "1".
  data[[src_idx - 13]][flagged] <- 1
  data[[src_idx]] <- NULL
}
```

Removes all the repeat columns asking if they have a certain mental health issue that are completely blank.
```{r}
# Drop columns 51 -> 63 in a single step (negative indices exclude columns).
data <- data[-(51:63)]
```

```{r}
# Answer counts for the overall diagnosis question before any cleanup.
table(data[["Have.you.ever.been.diagnosed.with.a.mental.health.disorder."]])
```


Cleanup Diagnosis Check column

If a respondent reported a specific disorder but left the "Have.you.ever.been.diagnosed.with.a.mental.health.disorder." column blank, that column is marked as "Yes".
```{r}
# Respondents who left the overall diagnosis answer (column 50) blank but have
# at least one specific-disorder flag set to 1 (columns 51 -> 63) are recorded
# as "Yes".
# %in% is used instead of == so that an NA cell counts as "no match" instead
# of aborting the chunk with a missing-value error inside if().
flag_cols <- 51:63
has_specific_dx <- Reduce(
  `|`,
  lapply(data[flag_cols], function(flag) flag %in% 1),
  rep(FALSE, nrow(data))
)
diagnosis_blank <- data[[50]] %in% ""
data[[50]][diagnosis_blank & has_specific_dx] <- "Yes"
```
```{r}
# Answer counts for the overall diagnosis question after the "Yes" fill-in.
table(data[["Have.you.ever.been.diagnosed.with.a.mental.health.disorder."]])
```
```{r}
# Answer counts for the "currently" and "in the past" questions.
table(data[["Do.you.currently.have.a.mental.health.disorder."]])
table(data[["Have.you.had.a.mental.health.disorder.in.the.past."]])
```
```{r}
# Levels of the three disorder-related columns.
levels(data[["Have.you.had.a.mental.health.disorder.in.the.past."]])
levels(data[["Do.you.currently.have.a.mental.health.disorder."]])
levels(data[["Have.you.ever.been.diagnosed.with.a.mental.health.disorder."]])
```

```{r}
# Switch the diagnosis column to plain character so that new answers can be
# written into it; it is converted back to a factor at the end of the next
# chunk.
data$Have.you.ever.been.diagnosed.with.a.mental.health.disorder. <- as.character(data$Have.you.ever.been.diagnosed.with.a.mental.health.disorder.)
```

```{r}
# Fill in column 50 where it is blank or NA, based on columns 49 and 64
# (presumably the "currently have" and "had in the past" answers -- confirm
# the column positions):
#
#   column 49                 column 64                        column 50 becomes
#   "No"                      "No", "Don't Know", "Possibly"   "No"
#   "Don't Know", "Possibly"  "Don't Know", "Possibly", "Yes"  "Maybe"
#
# Every other combination leaves column 50 unchanged. The masks are built
# with %in%, so an NA in column 49 or 64 means "no rule applies" instead of
# stopping the chunk with a missing-value error.
current_dx <- data[[49]]
past_dx <- data[[64]]
needs_fill <- is.na(data[[50]]) | data[[50]] %in% ""
unsure <- c("Don't Know", "Possibly")

fill_no <- needs_fill & current_dx %in% "No" & past_dx %in% c("No", unsure)
fill_maybe <- needs_fill & current_dx %in% unsure & past_dx %in% c(unsure, "Yes")

# The two masks cannot overlap (column 49 differs), so the order is irrelevant.
data[[50]][fill_no] <- "No"
data[[50]][fill_maybe] <- "Maybe"

data$Have.you.ever.been.diagnosed.with.a.mental.health.disorder. <- as.factor(data$Have.you.ever.been.diagnosed.with.a.mental.health.disorder.)
```

```{r}
# Answer counts for the diagnosis question after the fill-in step.
table(data[["Have.you.ever.been.diagnosed.with.a.mental.health.disorder."]])
# Sum of the 2nd to 4th entries of that table.
test <- c(
  table(data[["Have.you.ever.been.diagnosed.with.a.mental.health.disorder."]])[2:4]
)
sum(test)
# Show the full data frame.
data
```


Data Cleanup: Gender

Resources:
https://www.genderspectrum.org/quick-links/understanding-gender/
https://en.wikipedia.org/wiki/Non-binary_gender

```{r}
# Raw free-text gender answers and how often each occurs.
table(data[["What.is.your.gender."]])
```
```{r}
# Lookup vectors mapping the raw free-text gender answers to four categories.
# Matching is exact, so case variants, typos and trailing spaces are listed
# explicitly.

# Answers mapped to "cis-female".
cis_female <- c(
  'Female', 'female', 'Female ', 'Woman', 'woman', 'F', 'f',
  'I identify as female', '*shrug emoji* (F)', 'Cis woman',
  'Female (cisgender)', 'Cis-Female', 'Cisgendered woman',
  'She/her/they/them', 'Cis female', 'cisgender female', 'femalw', 'femail',
  'female (cis)', 'My sex is female.', 'female (cisgender)', 'Female (cis) ',
  'Woman-identified', 'cis-Female', 'cis female', 'F, cisgender',
  'Cis female '
)

# Answers mapped to "cis-male".
cis_male <- c(
  'male', 'Male', 'Ostensibly Male', 'male, born with xy chromosoms', 'Malel',
  'M', 'MALE', 'm', 'Cis-male', 'Male ', 'cis male ', 'Cis Male', 'Man',
  'Cisgender male', 'cis-male', 'Mail', 'cis hetero male', 'Male (cis)',
  "male (hey this is the tech industry you're talking about)",
  'God King of the Valajar', 'Cis male', 'man', 'Male, cis', 'cis male',
  'dude', 'SWM'
)

# Answers mapped to "transgender".
transgender <- c(
  'transgender', 'Trans woman', 'Trans female', 'trans woman',
  'Transfeminine', 'Trans man'
)

# Answers mapped to "genderqueer". Note that this list also contains the empty
# string and non-answers such as 'N/A', 'n/a', 'none', 'None' and 'Other'.
gq <- c(
  'male/androgynous ', 'Agender', '', 'Nonbinary',
  'Male (or female, or both)', 'non binary', 'Female/gender non-binary.',
  'N/A', 'genderfluid', 'Genderqueer', 'Demiguy', 'none', 'non-binary',
  'Other', 'NB', 'Genderfluid', 'Nonbinary/femme',
  'gender non-conforming woman', 'uhhhhhhhhh fem genderqueer?', 'n/a',
  'Non-binary', 'Agender/genderfluid', 'Male-ish', 'sometimes', 'Contextual',
  'Non binary', 'Genderqueer demigirl', 'Genderqueer/non-binary', 'nonbinary',
  'Female-ish', '\\-', 'None'
)
```

```{r}
# Collapse the free-text gender answers into the four categories defined by
# the lookup vectors cis_female, gq, transgender and cis_male. Answers found
# in none of the lists (including NA) are kept as they are.
gender <- as.vector(data$What.is.your.gender.)
n <- length(gender)

# Match against an untouched copy of the raw answers so that a label written
# by one step can never be re-matched by a later step.
raw_gender <- gender

# Precedence when an answer appears in more than one list:
# cis_female > gq > transgender > cis_male. Assigning in reverse order lets the
# higher-precedence category overwrite the lower one. Whole-vector assignment
# also works for an empty input, where a 1:n loop would fail.
gender[raw_gender %in% cis_male] <- "cis-male"
gender[raw_gender %in% transgender] <- "transgender"
gender[raw_gender %in% gq] <- "genderqueer"
gender[raw_gender %in% cis_female] <- "cis-female"
```

```{r}
# Counts per cleaned gender category.
table(gender)
```

```{r}
# Store the cleaned labels back in the data frame.
data[["What.is.your.gender."]] <- gender
```

```{r}
# Turn the cleaned labels into a factor.
data[["What.is.your.gender."]] <- as.factor(data[["What.is.your.gender."]])
```


```{r}
# Counts per category as stored in the data frame.
table(data[["What.is.your.gender."]])
```
```{r}
# Show the full data frame.
data
```
```{r}
# Structure of the gender column and the first rows of the data.
str(data[["What.is.your.gender."]])
head(data)
```


Creation of a model by Felix Quintana

Plan:
Create a model that will predict an employee's score for the tech industry. This provides both a pseudo-score (using a standard random forest I am not getting class probability estimates, only a classification) and label importance.
```{r}
# Convert every column of `data` to a factor. Assigning through `data[]` keeps
# the object a data frame with the same column names.
data[] <- lapply(data, factor)
```
Okay, so the second thing I want to do is preprocess the data. I begin by getting rid of rows that are not useful here, such as those of self-employed respondents.


```{r}
# Drop rows with a missing value in column 2 or column 3.
# Assigning na.omit(column) back into the column cannot do this: as soon as a
# value is missing the shortened vector no longer matches the number of rows
# and the assignment fails, so whole rows are filtered instead.
rows_complete <- !is.na(data[[2]]) & !is.na(data[[3]])
data <- data[rows_complete, , drop = FALSE]

# Remove every row whose column 2 equals 1 (the self-employed respondents).
# This is done with one logical filter: deleting rows inside a loop over a
# fixed 1:nrow(data) shifts the remaining rows up, so the row following each
# deletion is never checked and the loop eventually indexes past the end of
# the data frame.
is_self_employed <- data[[2]] %in% 1
data <- data[!is_self_employed, , drop = FALSE]
#Checking if all the self employed ppl are gone.
data

# Column subsets used by the two models; column 83 is the response that both
# model formulas predict.
# Earlier column selection kept for reference: 20,22,23,84 / 3-15,17,18,20,22,23, 84
randomForstCurrentEmployerModel <- data[, c(3:14, 17, 19, 21, 22, 83)]
randomForestCurrentAndPreviousEmployer <- data[, c(3:14, 17, 19, 21, 22, 31:40, 45, 47, 48, 83)]
# Keep only complete rows for the first model.
randomForstCurrentEmployerModel <- na.omit(randomForstCurrentEmployerModel)
```
Now to make a new data frame containing just what's interesting for me.
```{r}
#install.packages("randomForest");
library(randomForest)
library(randomForestExplainer)

# Split into training and validation sets, 70 : 30, at random.
# The seed is fixed so the split and the forest are reproducible.
set.seed(100)
n_rows <- nrow(randomForstCurrentEmployerModel)
train <- sample(n_rows, 0.7 * n_rows, replace = FALSE)
TrainSet <- randomForstCurrentEmployerModel[train, ]
ValidSet <- randomForstCurrentEmployerModel[-train, ]

# Fit the forest on the training rows, using every other column as a
# predictor and keeping the importance measures.
model1 <- randomForest(
  Overall..how.well.do.you.think.the.tech.industry.supports.employees.with.mental.health.issues. ~ .,
  data = TrainSet,
  importance = TRUE
)

# Optional check on the training set (disabled):
#predTrain <- predict(model1, TrainSet, type = "class")
#table(predTrain, TrainSet$Overall..how.well.do.you.think.the.tech.industry.supports.employees.with.mental.health.issues.)

# Predict on the validation set, then report the share of correct
# predictions and the predicted-vs-actual table.
predValid <- predict(model1, ValidSet, type = "class")
mean(
  predValid ==
    ValidSet$Overall..how.well.do.you.think.the.tech.industry.supports.employees.with.mental.health.issues.
)
table(
  predValid,
  ValidSet$Overall..how.well.do.you.think.the.tech.industry.supports.employees.with.mental.health.issues.
)




```
second model
```{r}
# Keep only complete rows of the current-and-previous-employer subset and
# inspect its structure.
randomForestCurrentAndPreviousEmployer <- na.omit(randomForestCurrentAndPreviousEmployer)
str(randomForestCurrentAndPreviousEmployer)

# 70 : 30 random split; this overwrites train, TrainSet and ValidSet.
n_rows <- nrow(randomForestCurrentAndPreviousEmployer)
train <- sample(n_rows, 0.7 * n_rows, replace = FALSE)
TrainSet <- randomForestCurrentAndPreviousEmployer[train, ]
ValidSet <- randomForestCurrentAndPreviousEmployer[-train, ]

# Fit the second forest; this overwrites model1.
model1 <- randomForest(
  Overall..how.well.do.you.think.the.tech.industry.supports.employees.with.mental.health.issues. ~ .,
  data = TrainSet,
  importance = TRUE
)

# Optional check on the training set (disabled):
#predTrain <- predict(model1, TrainSet, type = "class")
#table(predTrain, TrainSet$Overall..how.well.do.you.think.the.tech.industry.supports.employees.with.mental.health.issues.)

# Predict on the validation set, then report the share of correct
# predictions and the predicted-vs-actual table.
predValid <- predict(model1, ValidSet, type = "class")
mean(
  predValid ==
    ValidSet$Overall..how.well.do.you.think.the.tech.industry.supports.employees.with.mental.health.issues.
)
table(
  predValid,
  ValidSet$Overall..how.well.do.you.think.the.tech.industry.supports.employees.with.mental.health.issues.
)
```
Time for visualization
```{r}
# Pass the most recently fitted forest (model1) to randomForestExplainer's
# min_depth_distribution() and keep the result for plotting.
min_depth_frame <- randomForestExplainer::min_depth_distribution(model1)

```

end