-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmulticlass-roccurve.py
116 lines (96 loc) · 3.99 KB
/
multiclass-roccurve.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
def calculate_tpr_fpr(y_real, y_pred):
    '''
    Calculates the True Positive Rate (tpr) and the False Positive Rate (fpr) based on real and predicted observations.

    Both inputs are read as binary indicators: 0/False marks the negative class and 1/True marks the positive class.
    The four outcome counts are taken directly from the indicators, so inputs in which only one class occurs
    are handled as well.

    Args:
        y_real: The list, array or series with the real classes (0/1 or boolean).
        y_pred: The list, array or series with the predicted classes (0/1 or boolean).

    Returns:
        tpr: The True Positive Rate of the classifier (nan when there are no real positives).
        fpr: The False Positive Rate of the classifier (nan when there are no real negatives).

    Raises:
        ValueError: If y_real and y_pred do not have the same shape.
    '''
    real = np.asarray(y_real).astype(bool)
    pred = np.asarray(y_pred).astype(bool)
    if real.shape != pred.shape:
        raise ValueError("y_real and y_pred must have the same shape")

    # Counts each cell of the binary confusion matrix
    TP = int(np.count_nonzero(real & pred))
    FN = int(np.count_nonzero(real & ~pred))
    FP = int(np.count_nonzero(~real & pred))
    TN = int(np.count_nonzero(~real & ~pred))

    # A rate is undefined when its denominator is zero; report nan explicitly instead of dividing 0 by 0
    positives = TP + FN
    negatives = FP + TN
    tpr = TP / positives if positives else float('nan')  # sensitivity - true positive rate
    fpr = FP / negatives if negatives else float('nan')  # 1-specificity - false positive rate
    return tpr, fpr
def get_all_roc_coordinates(y_real, y_proba):
    '''
    Calculates all the ROC Curve coordinates (tpr and fpr) by considering each point as a threshold for the prediction of the class.

    Args:
        y_real: The list or series with the real classes.
        y_proba: The 1-D list, array or series with one positive-class score per observation
            (e.g. a single column of the `.predict_proba()` output).

    Returns:
        tpr_list: The list of TPRs representing each threshold, preceded by the origin value 0.
        fpr_list: The list of FPRs representing each threshold, preceded by the origin value 0.
    '''
    # Works on a plain array so thresholds are read by position: a pandas Series with a
    # non-default index would otherwise be looked up by label, and a list cannot be compared elementwise
    scores = np.asarray(y_proba)

    # The curve always starts at the origin
    tpr_list = [0]
    fpr_list = [0]
    for threshold in scores:
        # Every observation scoring at least the threshold is predicted as positive
        y_pred = scores >= threshold
        tpr, fpr = calculate_tpr_fpr(y_real, y_pred)
        tpr_list.append(tpr)
        fpr_list.append(fpr)
    return tpr_list, fpr_list
def plot_roc_curve(tpr, fpr, scatter = True, ax = None):
    '''
    Plots the ROC Curve by using the list of coordinates (tpr and fpr).

    Args:
        tpr: The list of TPRs representing each coordinate.
        fpr: The list of FPRs representing each coordinate.
        scatter: When True, the points used on the calculation will be plotted with the line (default = True).
        ax: The matplotlib axes to draw on. When None, a new 5x5 figure with its own axes is created.
    '''
    if ax is None:
        plt.figure(figsize = (5, 5))
        ax = plt.axes()

    if scatter:
        sns.scatterplot(x = fpr, y = tpr, ax = ax)
    # NOTE(review): lineplot's default estimator aggregates observations that share an x value -
    # confirm whether averaging the TPRs found at the same FPR is intended here
    sns.lineplot(x = fpr, y = tpr, ax = ax)
    # Diagonal reference line from (0, 0) to (1, 1)
    sns.lineplot(x = [0, 1], y = [0, 1], color = 'green', ax = ax)

    # Limits and labels are set on `ax` itself so they reach the requested axes
    # even when it is not pyplot's current one
    ax.set_xlim(-0.05, 1.05)
    ax.set_ylim(-0.05, 1.05)
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
def plot_multiclass_roc_curves(classes, categories, X_test, y_proba, y_test):
    '''
    Plots, for every class, the One-vs-Rest probability distribution (top row) and ROC Curve (bottom row).

    Args:
        classes: The sequence of class labels; position i must correspond to column i of y_proba.
        categories: The sequence of display names, one per class, used in the titles of the ROC Curve plots.
        X_test: The dataframe with the test features; a copy is used as the base of the auxiliary plotting dataframe.
        y_proba: The 2-D array with the probabilities for each class, read one column per class.
        y_test: The list or series with the real classes of the test observations.

    Returns:
        roc_auc_ovr: The dict that maps each class label to its One-vs-Rest ROC AUC.
    '''
    # Plots the Probability Distributions and the ROC Curves One vs Rest
    plt.figure(figsize = (9, 5))
    bins = [i/20 for i in range(20)] + [1]
    print(classes)
    roc_auc_ovr = {}

    # Keeps the 2x4 grid for up to four classes and widens it when there are more,
    # so the bottom row never runs out of subplot slots
    n_cols = max(4, len(classes))

    for i, c in enumerate(classes):
        # Prepares an auxiliar dataframe to help with the plots
        df_aux = X_test.copy()
        df_aux['class'] = [1 if y == c else 0 for y in y_test]
        df_aux['prob'] = y_proba[:, i]
        df_aux = df_aux.reset_index(drop = True)

        # Plots the probability distribution for the class and the rest
        ax = plt.subplot(2, n_cols, i + 1)
        sns.histplot(x = "prob", data = df_aux, hue = 'class', color = 'b', ax = ax, bins = bins)
        ax.set_title(c)
        ax.legend([f"Class: {c}", "Rest"])
        ax.set_xlabel(f"P(x = {c})")

        # Calculates the ROC Coordinates and plots the ROC Curves (same column, second row)
        ax_bottom = plt.subplot(2, n_cols, i + 1 + n_cols)
        tpr, fpr = get_all_roc_coordinates(df_aux['class'], df_aux['prob'])
        plot_roc_curve(tpr, fpr, scatter = False, ax = ax_bottom)
        ax_bottom.set_title(f"{categories[i]} - Rest")

        # Calculates the ROC AUC OvR
        roc_auc_ovr[c] = roc_auc_score(df_aux['class'], df_aux['prob'])

    plt.tight_layout()
    # Hands the per-class scores back to the caller instead of discarding them
    return roc_auc_ovr