# data_utils.py
import os
import pickle

import numpy as np
from imblearn.over_sampling import SMOTE
from numpy import ndarray
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tqdm import tqdm

DATA_DIR = 'data'
FILE_NAMES = ['normal_segments_sub.pkl', 'disease_segments_sub.pkl']

def remove_nan(features, labels):
    """Drop every sample (row) whose feature vector contains at least one NaN."""
    nan_indices = np.argwhere(np.isnan(features))
    nan_row_numbers = nan_indices[:, 0]  # the first axis index identifies the sample
    x_non_nan = np.delete(features, nan_row_numbers, axis=0)
    y_non_nan = np.delete(labels, nan_row_numbers, axis=0)
    return x_non_nan, y_non_nan
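
# Illustrative sketch (not part of the original module): how remove_nan behaves
# on a tiny made-up array.
#
#     >>> x = np.array([[1.0, np.nan, 3.0], [4.0, 5.0, 6.0]])
#     >>> y = np.array([0, 1])
#     >>> remove_nan(x, y)
#     (array([[4., 5., 6.]]), array([1]))
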
def augment(features, labels):
    """Balance the classes by oversampling the minority classes with SMOTE.

    SMOTE only accepts 2-D feature matrices and 1-D label vectors, so the
    features are flattened to (num_samples, -1) and one-hot (2-D) labels are
    converted to class indices before resampling; both are restored to their
    original shapes afterwards.

    :param features: array of shape (num_samples, ...)
    :param labels: 1-D class labels or 2-D one-hot labels
    :return: the resampled (features, labels) pair
    """
    labels = np.asarray(labels)
    one_hot = labels.ndim > 1
    y = np.argmax(labels, axis=1) if one_hot else labels
    classes, counts = np.unique(y, return_counts=True)
    original_feature_shape = (-1,) + features.shape[1:]
    num_samples = features.shape[0]
    print("Before resampling:")
    for cls, count in zip(classes, counts):
        print("Class:", cls, " Count:", count)
    over_sampler = SMOTE(k_neighbors=10, sampling_strategy='not majority')
    x_aug, y_aug = over_sampler.fit_resample(features.reshape(num_samples, -1), y)
    print("After resampling:")
    classes, counts = np.unique(y_aug, return_counts=True)
    for cls, count in zip(classes, counts):
        print("Class:", cls, " Count:", count)
    x_aug = x_aug.reshape(original_feature_shape)
    if one_hot:
        y_aug = np.eye(labels.shape[1])[y_aug]  # restore the one-hot encoding
    return x_aug, y_aug
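
# Illustrative sketch (made-up shapes, not from the original module): balance
# 200 windows of shape (60, 10) with a 3:1 one-hot label imbalance. Note that
# k_neighbors=10 requires every minority class to hold at least 11 samples.
#
#     x = np.random.rand(200, 60, 10)
#     y = np.eye(2)[np.r_[np.zeros(150, dtype=int), np.ones(50, dtype=int)]]
#     x_bal, y_bal = augment(x, y)   # x_bal: (300, 60, 10), y_bal: (300, 2)
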
def load_data(file_names, classification=0, test_split=0.33):
    """Load the pickled normal/disease segments, merge and shuffle them, and
    return a one-hot-encoded train/test split of the PSG data.

    :param file_names: the two pickle files (one 'normal', one disease)
    :param classification: 0 for binary labels, 1 for multi-class labels
    :param test_split: fraction of the data reserved for the test set
    :return: x_train, y_train, x_test, y_test
    """
    normal_data_dict, disease_data_dict = None, None
    for file_name in file_names:  # read preprocessing results
        with open(os.path.join(DATA_DIR, file_name), 'rb') as f:
            if 'normal' in file_name:
                normal_data_dict = pickle.load(f)
            else:
                disease_data_dict = pickle.load(f)
    # normal_derived_data = normal_data_dict['derived']
    # normal_waveform_data = normal_data_dict['waveform']
    normal_psg_data = normal_data_dict['psg']
    normal_labels = normal_data_dict['label']
    # disease_derived_data = disease_data_dict['derived']
    # disease_waveform_data = disease_data_dict['waveform']
    disease_psg_data = disease_data_dict['psg']
    disease_binary_labels = disease_data_dict['binary_label']
    disease_multi_labels = disease_data_dict['multi_label']
    # x = np.reshape(waveform_data, (-1, 80, 4))
    # x = np.average(x, axis=1)  # Reduce the dimension by averaging the 80 data points
    # Combine disease and normal data, then shuffle them
    # derived_data = np.concatenate((normal_derived_data, disease_derived_data), axis=0)
    # waveform_data = np.concatenate((normal_waveform_data, disease_waveform_data), axis=0)
    psg_data = np.concatenate((normal_psg_data, disease_psg_data), axis=0)
    # The normal class carries the same label in both labelling schemes
    binary_labels = np.concatenate((normal_labels, disease_binary_labels), axis=0)
    multi_labels = np.concatenate((normal_labels, disease_multi_labels), axis=0)
    # Shuffle data
    p = np.random.permutation(len(psg_data))
    # derived_data = derived_data[p]
    # waveform_data = waveform_data[p]
    psg_data = psg_data[p]
    binary_labels = binary_labels[p]
    multi_labels = multi_labels[p]
    # Choose the PSG data as features
    X = psg_data
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    if classification == 1:  # multi-class labels
        # le = LabelEncoder()
        # y = le.fit_transform(multi_labels)
        y = ohe.fit_transform(np.reshape(multi_labels, (-1, 1)))
    else:  # binary labels for binary classification
        y = ohe.fit_transform(np.reshape(binary_labels, (-1, 1)))
    # x_augmented, y_augmented = augment(*remove_nan(X, y))
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=test_split, random_state=42, shuffle=True)
    print("Train data shape:", x_train.shape)
    print("Test data shape:", x_test.shape)
    print("Train label shape:", y_train.shape)
    print("Test label shape:", y_test.shape)
    print("Class-wise count:")
    print(np.unique(y, return_counts=True))
    return x_train, y_train, x_test, y_test
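
# Illustrative sketch: the multi-class variant of the call in __main__ below.
# classification=1 one-hot-encodes the multi-class labels instead of the
# binary ones.
#
#     x_train, y_train, x_test, y_test = load_data(FILE_NAMES, classification=1)
#     num_classes = y_train.shape[1]   # one column per distinct label
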
def create_patient_map_features(file_path: str) -> dict:
    """Re-assemble the time series belonging to each patient.

    Returns a nested map: patient ID -> {feature name -> its whole time series}.
    """
    # Load the dataset (a pandas DataFrame whose first column is 'patient')
    with open(file_path, 'rb') as file:
        dataset = pickle.load(file)
    patient_map_features = {}
    pbar = tqdm(desc="Processed patients", total=len(dataset['patient'].unique()))
    for pat in dataset['patient'].unique():
        temp = dataset[dataset['patient'] == pat]
        feature_map_ts = {}
        for col in dataset.columns[1:]:
            if 'signal' not in col and 'PSG_' not in col:
                feature_map_ts[col] = temp[col].values
            else:
                # Waveform and PSG cells hold arrays; flatten them into one series
                feature_map_ts[col] = np.concatenate(temp[col].values)
        patient_map_features[pat] = feature_map_ts
        pbar.update(1)
    pbar.close()
    return patient_map_features
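
# Illustrative sketch (the pickle path follows the commented call in __main__;
# 'HR(bpm)' is one of the per-second features used below):
#
#     patient_map = create_patient_map_features('data/dataset_OSAS.pickle')
#     first_patient = next(iter(patient_map))
#     hr_series = patient_map[first_patient]['HR(bpm)']   # full heart-rate series
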
def convert_to_numpy_dataset(patient_map: dict, window_seconds=60) -> dict[str, ndarray]:
    """Turn the per-patient feature map into numpy arrays for machine learning.

    Each row of every array describes one window of length ``window_seconds``
    (in seconds). Windows are disjoint here, but it might be useful to
    generate overlapping windows as well.
    """
    derived_data = []  # will become a numpy array of ECG- and PPG-derived data
    waveform_data = []  # will become a numpy array of ECG and PPG waveform data
    psg_data = []  # will become a numpy array of PSG data
    label_data = []  # will become a numpy array of the "event" and "anomaly" labels
    list_patients = []  # keeps track of the patient ID for each row in the arrays
    list_derived_columns = []  # column names of derived_data (last dimension)
    list_waveform_columns = []  # column names of waveform_data (last dimension)
    list_psg_columns = []  # column names of psg_data (last dimension)
    list_label_columns = []  # column names of label_data (last dimension)
    pbar = tqdm(desc="Processed patients", total=len(patient_map))
    for pat in patient_map:
        num_values = len(patient_map[pat]['HR(bpm)'])
        max_values = (num_values // window_seconds) * window_seconds
        print("Patient", pat, " > Discarding the last", num_values - max_values, 'seconds.')
        # Derived data (one value per second)
        temp_list_derived = []
        derived_colnames = []
        for col in patient_map[pat]:
            if col in ['HR(bpm)', 'SpO2(%)', 'PI(%)', 'RR(rpm)', 'PVCs(/min)']:
                temp_list_derived.append(
                    np.asarray(patient_map[pat][col][:max_values]).reshape(-1, window_seconds))
                derived_colnames.append(col)
        # Stack as (windows, window_seconds, attributes)
        temp_list_derived = np.moveaxis(np.asarray(temp_list_derived), [0, 1, 2], [2, 0, 1])
        derived_data.append(temp_list_derived)
        if len(list_derived_columns) == 0:
            list_derived_columns = derived_colnames
        # Waveform data (80 values per second)
        temp_list_waveform = []
        waveform_colnames = []
        for col in patient_map[pat]:
            if 'signal' in col:
                temp_list_waveform.append(
                    np.asarray(patient_map[pat][col][:max_values * 80]).reshape(-1, window_seconds, 80))
                waveform_colnames.append(col)
        temp_list_waveform = np.moveaxis(np.asarray(temp_list_waveform), [0, 1, 2, 3], [3, 0, 1, 2])
        waveform_data.append(temp_list_waveform)
        if len(list_waveform_columns) == 0:
            list_waveform_columns = waveform_colnames
        # PSG data (10 values per second)
        temp_list_psg = []
        psg_colnames = []
        for col in patient_map[pat]:
            if 'PSG_' in col:
                temp_list_psg.append(
                    np.asarray(patient_map[pat][col][:max_values * 10]).reshape(-1, window_seconds, 10))
                psg_colnames.append(col)
        temp_list_psg = np.moveaxis(np.asarray(temp_list_psg), [0, 1, 2, 3], [3, 0, 1, 2])
        psg_data.append(temp_list_psg)
        if len(list_psg_columns) == 0:
            list_psg_columns = psg_colnames
        # Label data (one value per second)
        temp_list_label = []
        label_colnames = []
        for col in patient_map[pat]:
            if col == 'anomaly' or col == 'event':
                temp_list_label.append(
                    np.asarray(patient_map[pat][col][:max_values]).reshape(-1, window_seconds))
                label_colnames.append(col)
        temp_list_label = np.moveaxis(np.asarray(temp_list_label), [0, 1, 2], [2, 0, 1])
        label_data.append(temp_list_label)
        if len(list_label_columns) == 0:
            list_label_columns = label_colnames
        # Auxiliary data that keeps track of the patient related to each row
        list_patients.extend([pat] * temp_list_derived.shape[0])
        pbar.update(1)
    pbar.close()
    derived_data = np.vstack(derived_data)  # 16008 windows, of 60 seconds each, for 5 attributes
    waveform_data = np.vstack(waveform_data)  # 16008 windows, of 60 seconds each, 80 values per second, for 4 attributes
    psg_data = np.vstack(psg_data)  # 16008 windows, of 60 seconds each, 10 values per second, for 5 attributes
    label_data = np.vstack(label_data)  # 16008 windows, of 60 seconds each, for 2 labels
    print(derived_data.shape, waveform_data.shape, psg_data.shape, label_data.shape)
    print(len(list_patients), len(list_derived_columns), len(list_waveform_columns), len(list_psg_columns),
          len(list_label_columns))
    return {'derived_data': derived_data, 'waveform_data': waveform_data, 'psg_data': psg_data,
            'label_data': label_data}
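
# Illustrative sketch chaining the two functions above (pickle path as in the
# commented call in __main__):
#
#     patient_map = create_patient_map_features('data/dataset_OSAS.pickle')
#     arrays = convert_to_numpy_dataset(patient_map, window_seconds=60)
#     arrays['psg_data'].shape   # (num_windows, 60, 10, num_PSG_channels)
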
if __name__ == '__main__':
x_train, y_train, x_test, y_test = load_data(FILE_NAMES)
# create_patient_map_features('data/dataset_OSAS.pickle')