############################################################################
# CROSS VALIDATION #########################################################
############################################################################
############################################################################
# cv #######################################################################
############################################################################
# Cross-Validation Function
# This function performs cross-validation on the input dataset using different models (decision tree, random forest, logistic regression).
# It trains models, makes predictions, calculates performance metrics, and evaluates them across different thresholds.
#
# Arguments:
# - dataset: The input dataset for cross-validation. The dataset should contain the dependent variable 'euro_d', coded as "no"/"yes".
# - model_types: A character vector of model types to train (default is c("rpart", "rf", "glm"): decision tree, random forest, and logistic regression).
# - seed: An integer to set the random seed for reproducibility (default is 123).
# - k_folds: The number of folds to use in cross-validation (default is 10).
#
# Returns:
# - A list containing:
# 1. results: A list of data frames with the calculated metrics (accuracy, sensitivity, specificity, PPV, NPV, kappa) for each model and fold.
# 2. all_predictions: A data frame of all the predictions made during cross-validation, including the fold index, model type, true labels, and predicted probabilities.
# 3. thresholds_results: A data frame containing performance metrics calculated at different threshold levels for each model.
#
# Details:
# - The function first partitions the data into `k_folds` folds using caret's `createFolds` function.
# - It loops through the specified models ('rpart' for decision tree, 'rf' for random forest, and 'glm' for logistic regression).
# - For each model, it performs training and testing on each fold:
# - Trains the model on the training data and makes predictions on the test data.
# - Stores the predicted probabilities, true labels, and model information in `all_predictions`.
# - Calculates the confusion matrix and extracts performance metrics (accuracy, sensitivity, specificity, kappa, PPV, NPV).
# - Appends these metrics to the `results` list.
# - The function evaluates performance across a range of probability thresholds (0.05 to 0.95) using the `calculate_metrics` function, storing these results in `thresholds_results`.
cv <- function(dataset, model_types = c("rpart", "rf", "glm"), seed = 123, k_folds = 10) {
  set.seed(seed)
  results <- list()
  all_predictions <- data.frame()
  thresholds_results <- data.frame()
  thresholds <- seq(0.05, 0.95, by = 0.05)
  folds <- createFolds(dataset$euro_d, k = k_folds)
  for (model_type in model_types) {
    best_fold_accuracy <- 0
    best_fold_features <- NULL
    for (fold_idx in seq_along(folds)) {
      train_data <- dataset[-folds[[fold_idx]], ]
      test_data <- dataset[folds[[fold_idx]], ]
      model <- train_model(model_type, train_data)
      predictions <- make_predictions(model, test_data, model_type)
      fold_predictions <- data.frame(
        Fold = fold_idx,
        Model = model_type,
        TrueLabel = test_data$euro_d,
        PredictedProb = predictions
      )
      all_predictions <- rbind(all_predictions, fold_predictions)
      predicted_class <- ifelse(predictions > 0.5, "yes", "no")  # fixed 0.5 cutoff
      observed_factor <- factor(test_data$euro_d, levels = c("no", "yes"))
      # Note: with these levels, caret::confusionMatrix treats "no" (the first level)
      # as the positive class unless `positive = "yes"` is supplied.
      cm <- confusionMatrix(factor(predicted_class, levels = c("no", "yes")), observed_factor)
      accuracy <- cm$overall["Accuracy"]
      sensitivity <- cm$byClass["Sensitivity"]
      specificity <- cm$byClass["Specificity"]
      PPV <- cm$byClass["Pos Pred Value"]
      NPV <- cm$byClass["Neg Pred Value"]
      kappa <- cm$overall["Kappa"]
      results <- append(results, list(data.frame(Model = model_type, Fold = fold_idx, Indicator = "ACC", Value = accuracy)))
      results <- append(results, list(data.frame(Model = model_type, Fold = fold_idx, Indicator = "SENS", Value = sensitivity)))
      results <- append(results, list(data.frame(Model = model_type, Fold = fold_idx, Indicator = "SPEC", Value = specificity)))
      results <- append(results, list(data.frame(Model = model_type, Fold = fold_idx, Indicator = "Kappa", Value = kappa)))
      results <- append(results, list(data.frame(Model = model_type, Fold = fold_idx, Indicator = "PPV", Value = PPV)))
      results <- append(results, list(data.frame(Model = model_type, Fold = fold_idx, Indicator = "NPV", Value = NPV)))
      for (threshold in thresholds) {
        metrics <- calculate_metrics(threshold, predictions, test_data$euro_d)
        thresholds_results <- rbind(thresholds_results, data.frame(Model = model_type, Threshold = threshold * 100, t(metrics)))
      }
    }
  }
  return(list(results = results, all_predictions = all_predictions, thresholds_results = thresholds_results))
}
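# Example usage (a minimal sketch, not part of the pipeline): it assumes caret is
# loaded, that the project's `train_model`, `make_predictions`, and
# `calculate_metrics` helpers (defined elsewhere) are available, and that
# `my_data` is a hypothetical data frame whose `euro_d` column is coded "no"/"yes".
#
# cv_out <- cv(my_data, model_types = c("rpart", "glm"), seed = 42, k_folds = 5)
# metrics_long <- do.call(rbind, cv_out$results)             # one row per model/fold/indicator
# aggregate(Value ~ Model + Indicator, metrics_long, mean)   # mean of each metric across folds
# head(cv_out$all_predictions)                               # per-fold predicted probabilities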
############################################################################
# double_cv ################################################################
############################################################################
# Double Cross-Validation Function
# This function performs repeated k-fold cross-validation on the input dataset using the requested models (decision tree, random forest, logistic regression).
# In each repetition it balances the training data, trains the models, makes predictions, and calculates performance metrics.
#
# Arguments:
# - dataset: The input dataset for cross-validation. The dataset should contain the dependent variable 'euro_d', coded as "no"/"yes".
# - model_types: A character vector of model types to train (default is c("rpart", "rf", "glm"): decision tree, random forest, and logistic regression).
# - seed: An integer to set the random seed for reproducibility (default is 123).
# - N: The number of times the cross-validation process is repeated (default is 5).
# - k_folds: The number of folds to use in cross-validation (default is 10).
# - balance: The method for balancing the dataset ("under" for undersampling or "over" for oversampling; default is "under").
#
# Returns:
# - A list containing:
# 1. results: A list of data frames with the calculated metrics (accuracy, sensitivity, specificity, PPV, NPV, kappa) for each model, fold, and repetition.
# 2. all_predictions: A data frame of all predictions made during cross-validation, including the fold index, model type, true labels, and predicted probabilities.
# 3. thresholds_results: A data frame containing performance metrics calculated at different threshold levels for each model.
#
# Details:
# - The function repeats a `k_folds`-fold cross-validation `N` times, drawing a fresh set of folds in each repetition.
# - It balances the training data in each fold using the specified method ('under' or 'over') by calling the `balancing_dataset` function.
# - For each repetition (`N`), it loops through the specified models ('rpart' for decision tree, 'rf' for random forest, 'glm' for logistic regression):
# - Splits the data into `k_folds` folds using the `createFolds` function.
# - For each fold, it trains the model on the balanced training data and makes predictions on the test data.
# - Stores predicted probabilities, true labels, and model information in `all_predictions`.
# - Calculates the confusion matrix and extracts performance metrics (accuracy, sensitivity, specificity, kappa, PPV, NPV).
# - Appends these metrics to the `results` list.
# - The function evaluates performance across a range of probability thresholds (0.05 to 0.95) using the `calculate_metrics` function, storing these results in `thresholds_results`.
double_cv <- function(dataset, model_types = c("rpart", "rf", "glm"), seed = 123, N = 5, k_folds = 10, balance = "under") {
  set.seed(seed)
  results <- list()
  all_predictions <- data.frame()
  thresholds_results <- data.frame()
  thresholds <- seq(0.05, 0.95, by = 0.05)
  for (n in 1:N) {
    folds <- createFolds(dataset$euro_d, k = k_folds)
    for (model_type in model_types) {
      for (fold_idx in seq_along(folds)) {
        train_data <- dataset[-folds[[fold_idx]], ]
        test_data <- dataset[folds[[fold_idx]], ]
        balanced_train_data <- balancing_dataset(train_data, method = balance)
        model <- train_model(model_type, balanced_train_data)
        predictions <- make_predictions(model, test_data, model_type)
        fold_predictions <- data.frame(
          Fold = fold_idx,
          Model = model_type,
          TrueLabel = test_data$euro_d,
          PredictedProb = predictions
        )
        all_predictions <- rbind(all_predictions, fold_predictions)
        predicted_class <- ifelse(predictions > 0.5, "yes", "no")  # fixed 0.5 cutoff
        observed_factor <- factor(test_data$euro_d, levels = c("no", "yes"))
        # Note: with these levels, caret::confusionMatrix treats "no" (the first level)
        # as the positive class unless `positive = "yes"` is supplied.
        cm <- confusionMatrix(factor(predicted_class, levels = c("no", "yes")), observed_factor)
        accuracy <- cm$overall["Accuracy"]
        sensitivity <- cm$byClass["Sensitivity"]
        specificity <- cm$byClass["Specificity"]
        PPV <- cm$byClass["Pos Pred Value"]
        NPV <- cm$byClass["Neg Pred Value"]
        kappa <- cm$overall["Kappa"]
        results <- append(results, list(data.frame(Model = model_type, Fold = fold_idx, Indicator = "ACC", Value = accuracy)))
        results <- append(results, list(data.frame(Model = model_type, Fold = fold_idx, Indicator = "SENS", Value = sensitivity)))
        results <- append(results, list(data.frame(Model = model_type, Fold = fold_idx, Indicator = "SPEC", Value = specificity)))
        results <- append(results, list(data.frame(Model = model_type, Fold = fold_idx, Indicator = "Kappa", Value = kappa)))
        results <- append(results, list(data.frame(Model = model_type, Fold = fold_idx, Indicator = "PPV", Value = PPV)))
        results <- append(results, list(data.frame(Model = model_type, Fold = fold_idx, Indicator = "NPV", Value = NPV)))
        for (threshold in thresholds) {
          metrics <- calculate_metrics(threshold, predictions, test_data$euro_d)
          thresholds_results <- rbind(thresholds_results, data.frame(Model = model_type, Threshold = threshold * 100, t(metrics)))
        }
      }
    }
  }
  return(list(results = results, all_predictions = all_predictions, thresholds_results = thresholds_results))
}
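# Example usage (a hedged sketch): in addition to the assumptions noted for cv(),
# this relies on the project's `balancing_dataset` helper; `my_data` is again a
# hypothetical data frame with a "no"/"yes" `euro_d` column.
#
# dcv_out <- double_cv(my_data, model_types = c("rf"), N = 3, k_folds = 5, balance = "over")
# metrics_long <- do.call(rbind, dcv_out$results)
# aggregate(Value ~ Model + Indicator, metrics_long, mean)   # averages over folds and repetitions
# head(dcv_out$thresholds_results)                           # metrics per probability threshold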
############################################################################
# score_cross_validation ###################################################
############################################################################
# Score Cross-Validation Function
# This function performs a cross-validation on the input dataset using specified regression models and calculates evaluation metrics.
#
# Arguments:
# - dataset: The input dataset for cross-validation. It should contain the dependent variable 'euro_d' as a numeric score.
# - model_types: A vector specifying the types of models to use for training (default is c("rpart", "rf", "glm")).
# - "rpart": Decision tree regression model.
# - "rf": Random forest regression model.
# - "glm": Linear regression model.
#
# Returns:
# - A list containing:
# 1. best_MSE: A list of the lowest Mean Squared Error (MSE) achieved for each model type across all folds.
# 2. results: A list of data frames containing performance metrics (MSE, MAE, R2) for each model type and fold.
# 3. all_predictions: A data frame with predictions for each fold, including the fold index, model type, true labels, and predicted values.
#
# Details:
# - The function sets a fixed seed (123) for reproducibility and initializes lists to store the results and the best MSE for each model type.
# - The dataset is split into 10 folds using the `createFolds` function.
# - For each model type, the following steps are performed:
# - The data is split into training and test sets based on the current fold.
# - A model is trained using the `score_train_model` function on the training data.
# - Predictions are made on the test data using the `score_make_predictions` function.
# - Evaluation metrics are calculated:
# - Mean Squared Error (MSE).
# - Mean Absolute Error (MAE).
# - R-squared (R2).
# - The metrics are stored in the `results` list.
# - Predictions for the current fold are added to `all_predictions`.
# - The lowest MSE seen so far is tracked across folds and, once all folds are processed, stored as that model's best MSE.
# - The function returns the best MSE for each model, the complete results list, and all predictions.
score_cross_validation <- function(dataset, model_types = c("rpart", "rf", "glm")) {
  set.seed(123)
  best_overall_MSE <- list()
  results <- list()
  all_predictions <- data.frame()
  folds <- createFolds(dataset$euro_d, k = 10)
  for (model_type in model_types) {
    best_fold_MSE <- Inf
    best_overall_MSE[[model_type]] <- Inf
    for (fold_idx in seq_along(folds)) {
      all_features <- setdiff(names(dataset), 'euro_d')
      train_index <- -folds[[fold_idx]]
      test_index <- folds[[fold_idx]]
      selected_features <- all_features
      current_data <- dataset[, c(selected_features, 'euro_d')]
      train_data <- current_data[train_index, ]
      test_data <- current_data[test_index, ]
      model <- score_train_model(model_type, train_data)
      predictions <- score_make_predictions(model, test_data, model_type)
      # valid_indices <- which(predictions <= 12)
      # predictions <- predictions[valid_indices]
      # observed_values <- test_data$euro_d[valid_indices]
      observed_values <- test_data$euro_d
      if (length(predictions) > 0) {
        MSE <- mean((predictions - observed_values)^2)
        MAE <- mean(abs(predictions - observed_values))
        R2 <- 1 - sum((predictions - observed_values)^2) / sum((observed_values - mean(observed_values))^2)
        results <- append(results, list(data.frame(Model = model_type, Fold = fold_idx, Indicator = "MSE", Value = MSE)))
        results <- append(results, list(data.frame(Model = model_type, Fold = fold_idx, Indicator = "MAE", Value = MAE)))
        results <- append(results, list(data.frame(Model = model_type, Fold = fold_idx, Indicator = "R2", Value = R2)))
        fold_predictions <- data.frame(
          Fold = fold_idx,
          Model = model_type,
          TrueLabel = observed_values,
          PredictedValues = predictions
        )
        all_predictions <- rbind(all_predictions, fold_predictions)
        if (MSE < best_fold_MSE) {
          best_fold_MSE <- MSE
        }
      }
    }
    if (best_fold_MSE < best_overall_MSE[[model_type]]) {
      best_overall_MSE[[model_type]] <- best_fold_MSE
    }
  }
  return(list(
    best_MSE = best_overall_MSE,
    results = results,
    all_predictions = all_predictions
  ))
}
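# Example usage (a minimal sketch): assumes the project's `score_train_model`
# and `score_make_predictions` helpers are available and that `my_score_data`
# is a hypothetical data frame whose `euro_d` column is a numeric score.
#
# score_out <- score_cross_validation(my_score_data, model_types = c("rpart", "rf"))
# score_out$best_MSE                                         # lowest per-fold MSE for each model
# score_long <- do.call(rbind, score_out$results)
# aggregate(Value ~ Model + Indicator, score_long, mean)     # mean MSE, MAE, and R2 across folds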