-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathdata_loader.py
82 lines (66 loc) · 2.13 KB
/
data_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
"""Functions for data loading.

Reference: Jinsung Yoon, William R. Zame and Mihaela van der Schaar,
"Estimating Missing Data in Temporal Data Streams Using
Multi-Directional Recurrent Neural Networks,"
in IEEE Transactions on Biomedical Engineering,
vol. 66, no. 5, pp. 1477-1490, May 2019.

Paper Link: https://ieeexplore.ieee.org/document/8485748
Contact: jsyoon0823@gmail.com
"""
# Necessary packages
import numpy as np
from utils import MinMaxScaler
def data_loader(file_name='data/google.csv', seq_len=7,
                missing_rate=0.2):
    """Load complete data and introduce missingness.

    The file is read as comma-separated numbers (first row skipped as a
    header), its row order is reversed, and it is normalized with
    ``MinMaxScaler``. Sliding windows of ``seq_len`` consecutive rows are then
    cut from it and entries are masked out at random.

    Args:
      - file_name: the location of file to be loaded
      - seq_len: sequence length (number of rows per window)
      - missing_rate: rate of missing data to be introduced; each entry is
        marked missing when a uniform [0, 1) draw is <= missing_rate

    Returns:
      - x: data with missing values (missing entries are filled with 0)
      - m: observation indicator (m=1: observe, m=0: missing)
      - t: time information (time difference between two measurements);
        1 at the first step of every window, and at later steps the previous
        value plus 1 when the current entry is missing, otherwise 1
      - ori_x: original data without missing values (for evaluation)

      With ``rows`` data rows and ``dim`` columns, every returned array has
      shape (rows - seq_len, seq_len, dim) when rows > seq_len. Note that the
      last full window (starting at row ``rows - seq_len``) is not emitted;
      this count is kept unchanged so that output shapes stay compatible.
    """
    # ndmin=2 keeps a single-column (or single-row) file two-dimensional so
    # the (rows, dim) unpacking below works for one-feature datasets too.
    data = np.loadtxt(file_name, delimiter=",", skiprows=1, ndmin=2)

    # Reverse time order
    data = data[::-1]

    # Normalize the data (the second return value is not needed here)
    data, _ = MinMaxScaler(data)

    # Parameters
    no, dim = data.shape
    no = no - seq_len

    # Define original data: one window per start index.
    windows = [data[i:i + seq_len] for i in range(no)]

    # Introduce missingness. The mask is drawn once per window, in window
    # order, so results under a fixed numpy seed match the per-window draws.
    masks = []
    gaps = []
    for _ in range(no):
        mask = 1 * (np.random.uniform(0, 1, [seq_len, dim]) > missing_rate)

        # Time gap: starts at 1 and grows by one for each step whose entry
        # is missing; handled for all features of a time step at once.
        gap = np.ones([seq_len, dim])
        for k in range(1, seq_len):
            missing = mask[k] == 0
            gap[k, missing] = gap[k - 1, missing] + 1

        masks.append(mask)
        gaps.append(gap)

    # Convert into 3d numpy array
    ori_x = np.asarray(windows)
    m = np.asarray(masks)
    t = np.asarray(gaps)

    # Blank out the unobserved entries on a copy so ori_x stays intact.
    x = ori_x.copy()
    x[m == 0] = np.nan

    # Fill 0 to the missing values. 0.0 is nan_to_num's default replacement
    # for NaN; the second positional argument is `copy`, not a fill value.
    x = np.nan_to_num(x)

    return x, m, t, ori_x