# preprocess_utils.py
import os
import pickle
import gzip
import copy
import random
import time
import numpy as np
import matplotlib.pyplot as plt
from scipy import sparse
from scipy.stats import rankdata
import networkx as nx
import pandas as pd
from collections import defaultdict,Counter
from datetime import datetime, date
from itertools import combinations
from sklearn.metrics import roc_auc_score, roc_curve, auc, classification_report
from sklearn.metrics import precision_recall_curve, average_precision_score, confusion_matrix
from features_utils import *
def prepare_split_datasets(full_train_data, user_parameter, logs_file_name):
"""
    Split the whole dataset (unconnected pairs) into two classes: positive and negative samples.
    Parameters:
    - full_train_data: full dataset for training or testing
    - user_parameter: user parameters, namely num_class, IR_num, split_type, out_norm
    - logs_file_name: log-file prefix, usually derived from t0_c2_curve created by make_folders()
    return:
    data_subsets: negative and positive samples
"""
num_class, IR_num, split_type, out_norm = user_parameter
time_start=time.time()
data_subsets = []
    if split_type==0: ## e.g., IR_num=[100]; binary case: citations >= IR_num[0] vs. citations < IR_num[0]
        data_subsets.append(full_train_data[full_train_data[:, 2] < IR_num[0]]) # negative samples: citation count < IR_num[0]
        data_subsets.append(full_train_data[full_train_data[:, 2] >= IR_num[0]]) # positive samples: citation count >= IR_num[0]
else: # conditional case
for i in range(len(IR_num)-1): # e.g., IR_num=[[0,5], 100]; namely citation >=100 and citation in [0,5]
data_subsets.append(full_train_data[(full_train_data[:, 2] >= IR_num[i][0]) & (full_train_data[:, 2] <= IR_num[i][1])])
data_subsets.append(full_train_data[full_train_data[:, 2] >= IR_num[-1]])
if split_type==0:
        if len(data_subsets[0])<=3*10**7: # usually not met, since data_subsets[0] typically holds more than 600 million pairs
            num_row_chose=len(data_subsets[0])
        else:
            num_row_chose=min(len(data_subsets[0]),len(data_subsets[1])) # downsample the negative class to the size of the positive class
else: # conditional case
num_row_chose=len(data_subsets[0])
indices = np.random.choice(data_subsets[0].shape[0], size=num_row_chose, replace=False)
    data_subsets[0] = data_subsets[0][indices] # randomly choose num_row_chose negative samples
print(f"dataset len: {len(data_subsets[0])}, {len(data_subsets[1])}; num_row_chose: {num_row_chose}; {time.time()-time_start}s")
with open(logs_file_name+"_logs.txt", "a") as myfile:
myfile.write(f"\ndataset len: {len(data_subsets[0])}, {len(data_subsets[1])}; num_row_chose: {num_row_chose}; {time.time()-time_start}s")
return data_subsets
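
# A minimal, hypothetical usage sketch for prepare_split_datasets(), assuming
# full_train_data is an (N, 3) integer array of [node_u, node_v, citation_count]
# rows, as the column indexing above implies; the parameter values and the "demo"
# log prefix are illustrative, not the values used in the actual pipeline.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    demo_data = np.column_stack([
        rng.integers(0, 1000, size=(10_000, 2)),  # two node indices per unconnected pair
        rng.integers(0, 200, size=10_000),        # hypothetical future citation counts
    ])
    demo_user_parameter = (2, [100], 0, False)    # num_class, IR_num, split_type, out_norm (assumed values)
    demo_subsets = prepare_split_datasets(demo_data, demo_user_parameter, "demo")
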
def shuffle_split_datasets(data_subsets, train_valid_test_size):
"""
    Split the dataset into training and testing sets.
    Parameters:
    - data_subsets: the prepared negative and positive samples (unconnected pairs)
    - train_valid_test_size: split ratios, here train_valid_test_size=[0.85, 0.15, 0.0]: 85% for training, 15% for testing; evaluation uses future data rather than a held-out split
    return:
    dataset_train: training dataset
    dataset_test: testing dataset
"""
dataset_train = []
dataset_test = []
for subset in data_subsets:
np.random.shuffle(subset)
idx_train = int(len(subset) * train_valid_test_size[0])
train_set = subset[:idx_train]
test_set = subset[idx_train:]
dataset_train.append(train_set)
dataset_test.append(test_set)
return dataset_train, dataset_test
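
# A minimal sketch of the 85/15 split described above; the two subset arrays are
# synthetic stand-ins for the output of prepare_split_datasets().
if __name__ == "__main__":
    rng = np.random.default_rng(1)
    demo_subsets = [rng.integers(0, 1000, size=(200, 3)),  # "negative" pairs
                    rng.integers(0, 1000, size=(200, 3))]  # "positive" pairs
    demo_train, demo_test = shuffle_split_datasets(demo_subsets, [0.85, 0.15, 0.0])
    print([len(s) for s in demo_train], [len(s) for s in demo_test])  # [170, 170] [30, 30]
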
def get_pair_solution_datasets(data_subsets, hyper_parameter, user_parameter, logs_file_name):
"""
    Prepare the unconnected pairs in year y and their corresponding future-citation solutions.
    Parameters:
    - data_subsets: the prepared negative and positive samples (unconnected pairs)
    - hyper_parameter: batch_size, lr_enc, rnd_seed
    - user_parameter: not used here
    - logs_file_name: prefix of the txt file for logging the running status
    return:
    train_edge_pair: dataset (unconnected pairs)
    train_edge_solution: the corresponding citation counts
"""
num_class, IR_num, split_type, out_norm = user_parameter # not used here
batch_size, lr_enc, rnd_seed=hyper_parameter
start_time=time.time()
for idx, subset in enumerate(data_subsets):
        num_new_samples = len(subset) - batch_size # negative when the subset is smaller than batch_size
        if num_new_samples<0: # upsample with replacement so the subset reaches batch_size; set a smaller batch_size to avoid this branch
upsamples=np.abs(num_new_samples)
new_samples_idx = np.random.choice(subset.shape[0], size=upsamples, replace=True)
data_subsets[idx]= np.concatenate([subset, subset[new_samples_idx]], axis=0)
train_data_for_checking = np.concatenate(data_subsets, axis=0)
train_edge_pair=train_data_for_checking[:,:2]
train_edge_solution=train_data_for_checking[:,2]
print(f"\nDone, data_for_checking: {len(train_data_for_checking)}; elapsed_time: {time.time() - start_time}")
with open(logs_file_name+"_logs.txt", "a") as myfile:
myfile.write(f"\nDone, data_for_checking: {len(train_data_for_checking)}; elapsed_time: {time.time() - start_time}")
return train_edge_pair, train_edge_solution
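
# A minimal sketch showing how a subset smaller than batch_size is padded by
# resampling with replacement; all parameter values here are hypothetical.
if __name__ == "__main__":
    rng = np.random.default_rng(2)
    demo_hyper_parameter = (128, 3e-4, 42)        # batch_size, lr_enc, rnd_seed (assumed values)
    demo_user_parameter = (2, [100], 0, False)    # unpacked but unused inside the function
    demo_subsets = [rng.integers(0, 1000, size=(50, 3)),   # smaller than batch_size -> upsampled to 128
                    rng.integers(0, 1000, size=(300, 3))]
    demo_pairs, demo_solutions = get_pair_solution_datasets(
        demo_subsets, demo_hyper_parameter, demo_user_parameter, "demo")
    print(demo_pairs.shape, demo_solutions.shape)  # (428, 2) (428,)
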
def prepare_train_data(data_pair, data_solution, all_features, user_parameter, logs_file_name):
"""
    Prepare the features for the unconnected pairs and separate them into two classes, which are used in train_model().
    Parameters:
    - data_pair: unconnected pairs
    - data_solution: the citation solution
    - all_features: all the types of features, e.g., all_features=[node_feature, node_cfeature, pair_feature_train, pair_cfeature_train]
    - user_parameter: num_class, IR_num, split_type, out_norm
    - logs_file_name: prefix of the txt file for logging the running status
    return:
    data_feature: 2-D numpy array with one row per unconnected pair and 141 columns, one per feature
    data_input: features split by class; data_input[0] holds negative samples, data_input[1] positive samples
"""
num_class, IR_num, split_type, out_norm = user_parameter
    data_feature=get_all_feature(all_features, data_pair, logs_file_name) # compute all 141 features for each unconnected pair
data_input = []
start_time=time.time()
if split_type==0:
data_input.append(data_feature[data_solution < IR_num[0]]) # features for negative samples
data_input.append(data_feature[data_solution >= IR_num[0]]) # features for positive samples
else: # conditional case
for i in range(len(IR_num) - 1):
data_input.append(data_feature[(data_solution >= IR_num[i][0]) & (data_solution <=IR_num[i][1])])
data_input.append(data_feature[data_solution >= IR_num[-1]])
print(f"\n finish split_data_features: {time.time()-start_time}")
with open(logs_file_name+"_logs.txt", "a") as myfile:
myfile.write(f"\n finish split_data_features: {time.time()-start_time}")
return data_feature, data_input
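
# A usage sketch only (not run here): prepare_train_data() depends on get_all_feature()
# from features_utils, which needs the precomputed feature tables; all_features would be
# the list [node_feature, node_cfeature, pair_feature_train, pair_cfeature_train]
# mentioned in the docstring above.
#
#   data_feature, data_input = prepare_train_data(train_edge_pair, train_edge_solution,
#                                                 all_features, user_parameter, logs_file_name)
#   # data_feature.shape == (len(train_edge_pair), 141)
#   # len(data_input) == 2 for the binary case (split_type == 0)
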
######### classify_solution ###############
def classify_solution(data_solution, user_parameter):
"""
    Classify the citation solution into classes 0 and 1.
    Parameters:
    - data_solution: the citation solution
    - user_parameter: num_class, IR_num, split_type, out_norm
    return:
    solution_arr: 0-1 numpy array, 0 for negative samples, 1 for positive samples
"""
num_class, IR_num, split_type, out_norm = user_parameter
solution_arr = np.zeros_like(data_solution)
if split_type==0: ## only binary
solution_arr[data_solution < IR_num[0]] = 0
solution_arr[data_solution >= IR_num[0]] = 1
else: # conditional case
for i in range(len(IR_num) - 1): # in this work, IR_num=[[0,5], 100]
solution_arr[(data_solution >= IR_num[i][0]) & (data_solution <=IR_num[i][1])] = i # in this case, 0
solution_arr[data_solution >= IR_num[-1]] = len(IR_num) - 1 # in this case, 1
return solution_arr
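
# A minimal check of classify_solution() on synthetic citation counts, covering both
# the binary case (split_type == 0) and the conditional case; the thresholds are the
# ones quoted in the docstrings above.
if __name__ == "__main__":
    demo_solution = np.array([0, 3, 50, 100, 250])
    print(classify_solution(demo_solution, (2, [100], 0, False)))          # [0 0 0 1 1]
    print(classify_solution(demo_solution, (2, [[0, 5], 100], 1, False)))  # [0 0 0 1 1]; 50 falls in neither band and keeps the default 0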