-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwindowing.R
163 lines (140 loc) · 7.45 KB
/
windowing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# Function to dynamically classify euro_d and initial_euro_d based on thresholds
# `dep_ones` and `not_dep_ones` and handle missing values.
dinamic_classification_euro_score_dataset <- function(data, dep_ones, not_dep_ones) {
# Classify initial_euro_d as "yes" or "no" based on dep_ones and not_dep_ones
data <- data %>%
mutate(
initial_euro_d = case_when(
initial_euro_d >= dep_ones ~ "yes", # Mark as "yes" if greater than or equal to dep_ones
initial_euro_d < not_dep_ones & initial_euro_d != -1 ~ "no", # Mark as "no" if less than not_dep_ones
TRUE ~ NA_character_ # Assign NA if neither condition is met
),
# Add labels to `initial_euro_d` for clarity
initial_euro_d = labelled(initial_euro_d, labels = c("depressed" = "yes", "not depressed" = "no"), label = "Depression at BaseLine")
)
# Handle missing values for different data types
data <- data %>%
mutate(across(where(is.integer), ~if_else(is.na(.), as.integer(-1), .)),
across(where(is.double), ~if_else(is.na(.), as.double(-1), .)),
across(where(is.character), ~if_else(is.na(.), "-1", .)))
# Classify euro_d as "yes" or "no" based on dep_ones and not_dep_ones
data <- data %>%
mutate(
euro_d = case_when(
euro_d >= dep_ones ~ "yes", # Mark as "yes" if greater than or equal to dep_ones
euro_d < not_dep_ones ~ "no", # Mark as "no" if less than not_dep_ones
TRUE ~ NA_character_ # Assign NA if neither condition is met
),
# Add labels to `euro_d` for clarity
euro_d = labelled(euro_d, labels = c("depressed" = "yes", "not depressed" = "no"), label = "Depression")
)
return(data) # Return the processed dataset
}
# Initialize variables for storing results
final_results <- list() # Store final results for each dep-not_dep combination
n_generations <- 50 # Number of generations for genetic algorithm
pop_size <- 50 # Population size for genetic algorithm
# Loop over combinations of dep_ones and not_dep_ones
for (dep in 2:12) {
for (not_dep in 1:dep) {
# Dynamically classify the dataset
data <- dinamic_classification_euro_score_dataset(dataset, dep_ones = dep, not_dep_ones = not_dep)
data <- data %>% filter(!is.na(euro_d)) # Remove rows with NA in euro_d
# Check criteria for minimum row count and class balance
if (nrow(data) > 1500 && all(table(data$euro_d) >= 600)) {
model_types <- c("rf") # Define the model type (e.g., Random Forest)
seed <- 123 # Set random seed for reproducibility
N <- 5 # Number of iterations for cross-validation
k_folds <- 10 # Number of folds for cross-validation
balance <- "under" # Define the balancing strategy
set.seed(seed) # Set seed
results <- list() # Store results for each fold
best_N_accuracy <- 0 # Track the best accuracy across folds
best_N_features <- NA # Track the best features across folds
for (n in 1:N) {
folds <- createFolds(data$euro_d, k = k_folds) # Generate cross-validation folds
for (model_type in model_types) {
best_fold_accuracy <- 0 # Track best accuracy for the current fold
best_fold_features <- NULL # Track best features for the current fold
for (fold_idx in seq_along(folds)) {
all_features <- setdiff(names(data), 'euro_d') # Exclude target variable
train_index <- -folds[[fold_idx]] # Training set indices
test_index <- folds[[fold_idx]] # Test set indices
# Train genetic algorithm for feature selection
ga_model <- ga(
type = "binary",
fitness = function(selected_features) {
res <- fitness_function_bl_under(selected_features, data, train_index, test_index, model_type, fold_idx)
return(res$accuracy) # Fitness is based on accuracy
},
nBits = length(all_features), # Number of bits for genetic algorithm
maxiter = n_generations, # Maximum number of generations
popSize = pop_size, # Population size
names = all_features, # Feature names
run = 50, # Maximum iterations without improvement
keepBest = TRUE # Keep the best solution
)
best_solution <- ga_model@solution[1, ] # Extract the best solution
selected_features <- all_features[which(best_solution == 1)] # Get selected features
# Evaluate the solution
res <- fitness_function_bl_under(best_solution, data, train_index, test_index, model_type, fold_idx)
accuracy <- res$accuracy
sensitivity <- res$sensitivity
specificity <- res$specificity
# Store results for this fold
results <- append(results, list(data.frame(Model = model_type, Fold = fold_idx, Indicator = "Accuracy", Value = accuracy)))
results <- append(results, list(data.frame(Model = model_type, Fold = fold_idx, Indicator = "Sensitivity", Value = sensitivity)))
results <- append(results, list(data.frame(Model = model_type, Fold = fold_idx, Indicator = "Specificity", Value = specificity)))
# Update the best fold results
if (accuracy > best_fold_accuracy) {
best_fold_accuracy <- accuracy
best_fold_features <- selected_features
}
}
}
# Update the best results across all folds
if (best_fold_accuracy > best_N_accuracy) {
best_N_accuracy <- best_fold_accuracy
best_N_features <- best_fold_features
}
}
# Aggregate results and calculate averages
combined_results <- do.call(rbind, results)
avg_accuracy <- mean(combined_results$Value[combined_results$Indicator == "Accuracy"], na.rm = TRUE)
avg_sensitivity <- mean(combined_results$Value[combined_results$Indicator == "Sensitivity"], na.rm = TRUE)
avg_specificity <- mean(combined_results$Value[combined_results$Indicator == "Specificity"], na.rm = TRUE)
# Save the results for the current dep-not_dep combination
final_results[[paste0("dep_", dep, "_not_dep_", not_dep)]] <- list(
Accuracy = avg_accuracy,
Sensitivity = avg_sensitivity,
Specificity = avg_specificity,
SelectedFeatures = best_N_features
)
} else {
# If criteria are not met, save empty results
final_results[[paste0("dep_", dep, "_not_dep_", not_dep)]] <- list(
Accuracy = NA,
Sensitivity = NA,
Specificity = NA,
SelectedFeatures = NA
)
}
}
}
# Convert final_results into a data frame with detailed descriptions
results_df <- do.call(rbind, lapply(names(final_results), function(name) {
# Extract dep and not_dep values from the name
dep_value <- as.numeric(sub(".*dep_(\\d+)_not_dep_.*", "\\1", name))
not_dep_value <- as.numeric(sub(".*not_dep_(\\d+)", "\\1", name))
# Create a readable description of the combination
combination_description <- paste0("euro_d >= ", dep_value, " ~ yes, euro_d < ", not_dep_value, " ~ no")
# Create a row with the combination and metrics
data.frame(
Combination = combination_description,
Accuracy = final_results[[name]]$Accuracy,
Sensitivity = final_results[[name]]$Sensitivity,
Specificity = final_results[[name]]$Specificity
)
}))
# Print the results in the new format
print(results_df)