-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBalanced Data.R
44 lines (37 loc) · 1.33 KB
/
Balanced Data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
library(dplyr)
# Check the balance of the dataset ----
# Load the records from disk, then report how the rows are spread over the
# values of the Outcome column: first as raw counts, then as percentages.
health_data <- read.csv(file = "health_data.csv")

outcome_breakdown <- table(health_data[["Outcome"]])
print(outcome_breakdown)

# Share of each class among the counted rows, scaled to percent.
outcome_breakdown_percentage <-
  outcome_breakdown / sum(outcome_breakdown) * 100
print(outcome_breakdown_percentage)
# Convert columns to factors ----
# Re-encode each listed column with as.factor(); every other column of
# health_data is left exactly as it was read.
health_data <- health_data %>%
  mutate(
    across(
      all_of(c(
        "Gender",
        "SmokingStatus",
        "AlcoholConsumption",
        "ExerciseFrequency",
        "HeartDisease",
        "Diabetes",
        "PhysicalActivityLevel",
        "DietQuality",
        "Outcome"
      )),
      as.factor
    )
  )
## Balance the dataset via random undersampling ----
# Split the records into one data frame per Outcome class.
healthy <- health_data %>%
  filter(Outcome == "Healthy")
at_risk <- health_data %>%
  filter(Outcome == "At Risk")
critical <- health_data %>%
  filter(Outcome == "Critical")

# Every class is reduced to the size of the smallest one. The target must not
# be tied to one particular class: sample_n() without replacement aborts when
# asked for more rows than a class contains.
target_size <- min(nrow(healthy), nrow(at_risk), nrow(critical))
if (target_size == 0L) {
  warning(
    "At least one Outcome class has no rows; ",
    "the balanced dataset will be empty.",
    call. = FALSE
  )
}

# Fixed seed so the same rows are drawn on every run.
set.seed(6203)
samples_critical <- critical %>%
  sample_n(size = target_size)
samples_healthy <- healthy %>%
  sample_n(size = target_size)
# "At Risk" is drawn last, and only when it is larger than the target, so the
# random draws above are unaffected when it already is the smallest class.
samples_at_risk <- if (nrow(at_risk) > target_size) {
  at_risk %>%
    sample_n(size = target_size)
} else {
  at_risk
}

health_data_balanced <- bind_rows(
  samples_healthy,
  samples_at_risk,
  samples_critical
)

# Shuffle the rows so the classes are not stored in contiguous runs.
shuffled_rows <- sample.int(nrow(health_data_balanced))
health_data_balanced <- health_data_balanced[shuffled_rows, , drop = FALSE]

# Explicit print() so both summaries are shown even when the script is run
# through source(), where top-level values are not auto-printed.
print(summary(health_data))
print(summary(health_data_balanced))