Skip to content

Commit 10a6f7e

Browse files
committedNov 26, 2018
Code and data ready
1 parent 0faf697 commit 10a6f7e

24 files changed

+12400821
-0
lines changed
 

‎dataset/Amazon.py

+134
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
from dataset.Dataset import DataSet
2+
3+
4+
# Amazon review dataset
5+
class Electronics(DataSet):
    """Loader for the Amazon 'Electronics' review dataset."""

    def __init__(self):
        # Locations of the preprocessed pickles and bag-of-words file.
        self.dir_path = './dataset/data/amazon/Electronics/'
        self.user_record_file = 'Electronics_user_records.pkl'
        self.user_mapping_file = 'Electronics_user_mapping.pkl'
        self.item_mapping_file = 'Electronics_item_mapping.pkl'
        self.item_content_file = 'word_counts.txt'
        self.item_relation_file = 'item_relation.pkl'

        # Dataset statistics used by the model; user/item counts are
        # recomputed from the mappings in generate_dataset().
        self.num_users = 37204
        self.num_items = 13881
        self.vocab_size = 10104

        self.user_records = None
        self.user_mapping = None
        self.item_mapping = None

    def generate_dataset(self, seed=0):
        """Load the pickled data and return the train/test split plus item side info.

        Returns (train_matrix, train_set, test_set,
                 item_content_matrix, item_relation_matrix).
        """
        base = self.dir_path
        user_records = self.load_pickle(base + self.user_record_file)
        user_mapping = self.load_pickle(base + self.user_mapping_file)
        item_mapping = self.load_pickle(base + self.item_mapping_file)

        self.num_users = len(user_mapping)
        self.num_items = len(item_mapping)

        inner_records, user_inverse_mapping, item_inverse_mapping = \
            self.convert_to_inner_index(user_records, user_mapping, item_mapping)

        train_set, test_set = self.split_data_randomly(inner_records, seed)
        train_matrix = self.generate_rating_matrix(train_set, self.num_users, self.num_items)
        item_content_matrix = self.load_item_content(base + self.item_content_file, self.vocab_size)
        item_relation_matrix = self.load_pickle(base + self.item_relation_file)

        return train_matrix, train_set, test_set, item_content_matrix, item_relation_matrix
class Books(DataSet):
    """Loader for the Amazon 'Books' review dataset."""

    def __init__(self):
        # Locations of the preprocessed pickles and bag-of-words file.
        self.dir_path = './dataset/data/amazon/Books/'
        self.user_record_file = 'Books_user_records.pkl'
        self.user_mapping_file = 'Books_user_mapping.pkl'
        self.item_mapping_file = 'Books_item_mapping.pkl'
        self.item_content_file = 'word_counts.txt'
        self.item_relation_file = 'item_relation.pkl'
        self.item_word_seq_file = 'review_word_sequence.pkl'

        # Dataset statistics used by the model; user/item counts are
        # recomputed from the mappings in generate_dataset().
        self.num_users = 65476
        self.num_items = 41264
        self.vocab_size = 27584

        self.user_records = None
        self.user_mapping = None
        self.item_mapping = None

    def generate_dataset(self, seed=0):
        """Load the pickled data and return the train/test split plus item side info.

        Returns (train_matrix, train_set, test_set,
                 item_content_matrix, item_relation_matrix, word_seq).
        """
        base = self.dir_path
        user_records = self.load_pickle(base + self.user_record_file)
        user_mapping = self.load_pickle(base + self.user_mapping_file)
        item_mapping = self.load_pickle(base + self.item_mapping_file)
        word_seq = self.load_pickle(base + self.item_word_seq_file)

        self.num_users = len(user_mapping)
        self.num_items = len(item_mapping)

        inner_records, user_inverse_mapping, item_inverse_mapping = \
            self.convert_to_inner_index(user_records, user_mapping, item_mapping)

        train_set, test_set = self.split_data_randomly(inner_records, seed)
        train_matrix = self.generate_rating_matrix(train_set, self.num_users, self.num_items)
        item_content_matrix = self.load_item_content(base + self.item_content_file, self.vocab_size)
        item_relation_matrix = self.load_pickle(base + self.item_relation_file)

        return train_matrix, train_set, test_set, item_content_matrix, item_relation_matrix, word_seq
class CDs(DataSet):
    """Loader for the Amazon 'CDs' review dataset."""

    def __init__(self):
        # Locations of the preprocessed pickles and bag-of-words file.
        self.dir_path = './dataset/data/amazon/CDs/'
        self.user_record_file = 'CDs_user_records.pkl'
        self.user_mapping_file = 'CDs_user_mapping.pkl'
        self.item_mapping_file = 'CDs_item_mapping.pkl'
        self.item_content_file = 'word_counts.txt'
        self.item_relation_file = 'item_relation.pkl'
        self.item_word_seq_file = 'review_word_sequence.pkl'

        # Dataset statistics used by the model; user/item counts are
        # recomputed from the mappings in generate_dataset().
        self.num_users = 24934
        self.num_items = 24634
        self.vocab_size = 24341

        self.user_records = None
        self.user_mapping = None
        self.item_mapping = None

    def generate_dataset(self, seed=0):
        """Load the pickled data and return the train/test split plus item side info.

        Returns (train_matrix, train_set, test_set,
                 item_content_matrix, item_relation_matrix, word_seq).
        """
        base = self.dir_path
        user_records = self.load_pickle(base + self.user_record_file)
        user_mapping = self.load_pickle(base + self.user_mapping_file)
        item_mapping = self.load_pickle(base + self.item_mapping_file)
        word_seq = self.load_pickle(base + self.item_word_seq_file)

        self.num_users = len(user_mapping)
        self.num_items = len(item_mapping)

        inner_records, user_inverse_mapping, item_inverse_mapping = \
            self.convert_to_inner_index(user_records, user_mapping, item_mapping)

        train_set, test_set = self.split_data_randomly(inner_records, seed)
        train_matrix = self.generate_rating_matrix(train_set, self.num_users, self.num_items)
        item_content_matrix = self.load_item_content(base + self.item_content_file, self.vocab_size)
        item_relation_matrix = self.load_pickle(base + self.item_relation_file)

        return train_matrix, train_set, test_set, item_content_matrix, item_relation_matrix, word_seq
if __name__ == '__main__':
    # Smoke-test the CDs loader: print the last word sequence, the longest
    # sequence length, and the indices of items with an empty content row.
    data_set = CDs()
    train_matrix, train_set, test_set, item_content_matrix, item_relation_matrix, word_seq = \
        data_set.generate_dataset()
    print(word_seq[-1])

    max_len = max((len(word_list) for word_list in word_seq), default=0)
    print(max_len)

    for row_idx in range(item_content_matrix.shape[0]):
        if item_content_matrix.getrow(row_idx).getnnz() == 0:
            print(row_idx)

‎dataset/Dataset.py

+105
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
import pickle
2+
import math
3+
4+
import numpy as np
5+
from scipy.sparse import csr_matrix
6+
from sklearn.model_selection import train_test_split
7+
8+
9+
class DataSet(object):
    """Shared loading / splitting helpers for the recommendation datasets.

    Subclasses (e.g. the Amazon dataset loaders) supply the file paths and
    call these helpers from their ``generate_dataset`` implementations.
    """

    def load_pickle(self, name):
        """Unpickle and return the object stored at path *name*.

        latin1 encoding keeps Python-2 era pickles loadable under Python 3.
        """
        with open(name, 'rb') as f:
            return pickle.load(f, encoding='latin1')

    def generate_inverse_mapping(self, data_list):
        """Return a dict mapping each true id in *data_list* to its index (inner id)."""
        return {true_id: inner_id for inner_id, true_id in enumerate(data_list)}

    def convert_to_inner_index(self, user_records, user_mapping, item_mapping):
        """Re-index raw user/item ids into dense 0-based inner ids.

        Returns (inner_user_records, user_inverse_mapping, item_inverse_mapping),
        where inner_user_records[u] is the list of inner item ids for inner user u.
        """
        inner_user_records = []
        user_inverse_mapping = self.generate_inverse_mapping(user_mapping)
        item_inverse_mapping = self.generate_inverse_mapping(item_mapping)

        for user_id in range(len(user_mapping)):
            real_user_id = user_mapping[user_id]
            item_list = [item_inverse_mapping[real_item_id]
                         for real_item_id in user_records[real_user_id]]
            inner_user_records.append(item_list)

        return inner_user_records, user_inverse_mapping, item_inverse_mapping

    def split_data_randomly(self, user_records, seed=0):
        """Randomly hold out ~20% of each user's items as that user's test set.

        Returns (train_set, test_set), parallel lists of per-user item lists.
        The train sample keeps the user's original item order.
        """
        test_ratio = 0.2
        train_set = []
        test_set = []
        for user_id, item_list in enumerate(user_records):
            tmp_train_sample, tmp_test_sample = train_test_split(
                item_list, test_size=test_ratio, random_state=seed)

            # Set membership is O(1); the original list scan was O(n) per item.
            test_items = set(tmp_test_sample)
            train_sample = [place for place in item_list if place not in test_items]
            test_sample = list(tmp_test_sample)

            train_set.append(train_sample)
            test_set.append(test_sample)
        return train_set, test_set

    def split_data_sequentially(self, user_records):
        """Hold out the last ~20% of each user's items (order-preserving split).

        Returns (train_set, test_set), parallel lists of per-user item lists.
        """
        test_ratio = 0.2  # renamed from the original 'test_radio' typo
        train_set = []
        test_set = []

        for item_list in user_records:
            num_test_samples = int(math.ceil(len(item_list) * test_ratio))
            # Tail of the list is the test sample; everything else trains.
            test_sample = list(item_list[len(item_list) - num_test_samples:])
            test_items = set(test_sample)
            train_sample = [place for place in item_list if place not in test_items]

            train_set.append(train_sample)
            test_set.append(test_sample)

        return train_set, test_set

    def generate_rating_matrix(self, train_set, num_users, num_items):
        """Build a binary (num_users, num_items) csr_matrix from per-user item lists."""
        # Three parallel lists feed the sparse COO-style constructor.
        row = []
        col = []
        data = []
        for user_id, article_list in enumerate(train_set):
            for article in article_list:
                row.append(user_id)
                col.append(article)
                data.append(1)

        rating_matrix = csr_matrix(
            (np.array(data), (np.array(row), np.array(col))),
            shape=(num_users, num_items))
        return rating_matrix

    def load_item_content(self, f_in, D=8000):
        """Parse bag-of-words lines into a (num_lines, D) csr_matrix.

        Each line is expected as: "<id> <count> idx:val idx:val ...";
        the first two tokens are skipped. TODO confirm against the data files.
        """
        # 'with' guarantees the handle is closed (the original leaked it).
        with open(f_in) as fp:
            lines = fp.readlines()

        X = np.zeros((len(lines), D))
        for i, line in enumerate(lines):
            for strr in line.strip().split(' ')[2:]:
                segs = strr.split(':')
                X[i, int(segs[0])] = float(segs[1])

        return csr_matrix(X)

‎dataset/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)