-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdatasets.py
155 lines (117 loc) · 5.83 KB
/
datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import numpy as np
from sklearn.datasets import make_blobs
import utils_config
import pandas as pd
config_file_d="./conf/datasets.json"
config_d = utils_config.load_config(config_file_d)
config_d = utils_config.serialize_config(config_d)
PROTECTED = config_d['PROTECTED']
NOT_PROTECTED = config_d['NOT_PROTECTED']
POSITIVE = config_d['POSITIVE']
NEGATIVE = config_d['NEGATIVE']
def get_and_preprocess_cc():
""""Handle processing of Communities and Crime. We exclude rows with missing values and predict
if the violent crime is in the 50th percentile.
Parameters
----------
params : Params
Returns:
----------
Pandas data frame X of processed data, np.ndarray y, and list of column names
"""
X = pd.read_csv("./data/communities_and_crime.csv", index_col=0)
# everything over 50th percentil gets negative outcome (lots of crime is bad)
high_violent_crimes_threshold = 50
y_col = 'ViolentCrimesPerPop numeric'
X = X[X[y_col] != "?"]
X[y_col] = X[y_col].values.astype('float32')
# just dump all x's that have missing values
cols_with_missing_values = []
for col in X:
if '?' in X[col].values.tolist():
cols_with_missing_values.append(col)
y = X[y_col]
y_cutoff = np.percentile(y, high_violent_crimes_threshold)
protected_col = X['racepctblack numeric']
protected_attribute = np.array([PROTECTED if val > np.percentile(protected_col, 50) else NOT_PROTECTED for val in protected_col])
X = X.drop(cols_with_missing_values + ['racepctblack numeric', 'communityname string', 'fold numeric', 'county numeric', 'community numeric', 'state numeric'] + [y_col], axis=1)
# setup ys
y = np.array([NEGATIVE if val > y_cutoff else POSITIVE for val in y])
return X.values, y, protected_attribute, np.array([]) #np.array([17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 74, 34, 72, 80, 56, 13, 23, 84, 40, 68,1,2,4,5,6,7,8,9,10,11,12,14,15,16])
def get_and_preprocess_german():
X = pd.read_csv("data/german_processed.csv")
y = X["GoodCustomer"]
X = X.drop(["GoodCustomer", "PurposeOfLoan"], axis=1)
X['Gender'] = [NOT_PROTECTED if v == "Male" else PROTECTED for v in X['Gender'].values]
y = np.array([POSITIVE if p == 1 else NEGATIVE for p in y.values])
p = X['Gender'].values
X = X.drop(['Gender'], axis=1)
num = [2,3,4,5,6,7,8]
cat = [v for v in range(X.shape[1]) if v not in num]
cols = [c for c in X]
return X.values, y, p, np.array(cat)
def get_and_preprocess_adult():
from dice_ml.utils import helpers
data = helpers.load_adult_income_dataset()
y = data['income'].values
p = data['gender'].values
p = np.array([PROTECTED if v == 'Female' else NOT_PROTECTED for v in p])
data = data.drop(['income', 'gender'], axis=1)
data = pd.get_dummies(data)
X = data.values
cols = [c for c in data]
cat = [i for i, c in enumerate(cols) if c not in ['age', 'hours_per_week']]
return X, y, p, np.array(cat)
def get_and_preprocess_compas_data():
"""Handle processing of COMPAS according to: https://github.com/propublica/compas-analysis
Parameters
----------
params : Params
Returns
----------
Pandas data frame X of processed data, np.ndarray y, and list of column names
"""
compas_df = pd.read_csv("data/compas-scores-two-years.csv", index_col=0)
compas_df = compas_df.loc[(compas_df['days_b_screening_arrest'] <= 30) &
(compas_df['days_b_screening_arrest'] >= -30) &
(compas_df['is_recid'] != -1) &
(compas_df['c_charge_degree'] != "O") &
(compas_df['score_text'] != "NA")]
compas_df['length_of_stay'] = (pd.to_datetime(compas_df['c_jail_out']) - pd.to_datetime(compas_df['c_jail_in'])).dt.days
# X = compas_df[['age', 'c_charge_degree', 'race', 'sex', 'priors_count', 'length_of_stay']]
X = compas_df[['age', 'race', 'priors_count', 'length_of_stay']]
# print (compas_df['score_text'])
# if person has high score give them the _negative_ model outcome
y = np.array([NEGATIVE if score == 'High' else POSITIVE for score in compas_df['score_text']])
sens = X.pop('race')
# assign African-American as the protected class
X = pd.get_dummies(X)
sensitive_attr = np.array(pd.get_dummies(sens).pop('African-American'))
sensitive_attr = np.array([PROTECTED if val == 1 else NOT_PROTECTED for val in sensitive_attr])
return X.values, y, sensitive_attr
def get_data_set(name):
"""
Gets the data set given the name. If sythetic, creates blobs sythetic dataset example.
"""
if name == "synthetic":
data, t_labels = make_blobs(n_samples=100, n_features=2, cluster_std=2, centers=[[8,5], [3,1], [-4,1], [-8,5]])
labels = np.zeros_like(t_labels)
labels[np.logical_or(t_labels == 0, t_labels == 1)] = POSITIVE
labels[np.logical_or(t_labels == 2, t_labels == 3)] = NEGATIVE
protected = np.zeros_like(t_labels)
protected[np.logical_or(t_labels == 0, t_labels == 3)] = PROTECTED
protected[np.logical_or(t_labels == 1, t_labels == 2)] = NOT_PROTECTED
cats = np.array([])
elif name == "compas":
data, labels, protected = get_and_preprocess_compas_data()
elif name == "cc":
data, labels, protected, cats = get_and_preprocess_cc()
elif name == "german":
data, labels, protected, cats = get_and_preprocess_german()
elif name == "adult":
data, labels, protected, cats = get_and_preprocess_adult()
else:
raise NotImplementedError
training = np.random.choice(data.shape[0], size=int(data.shape[0] * 0.9))
testing = np.array([i for i in range(data.shape[0]) if i not in training])
return data[training], labels[training], protected[training], data[testing], labels[testing], protected[testing], cats