1
+ from dataset .Dataset import DataSet
2
+
3
+
4
+ # Amazon review dataset
5
class Electronics(DataSet):
    """Amazon "Electronics" review dataset.

    Stores the directory and file names of the pre-processed data together
    with default corpus sizes.  The loading/conversion helpers used below
    (``load_pickle``, ``convert_to_inner_index``, ...) are not defined in this
    class; presumably they are inherited from ``DataSet`` -- verify there.
    """

    def __init__(self):
        # NOTE(review): the base-class initializer is not invoked here.
        # Where the pre-processed files live.
        self.dir_path = './dataset/data/amazon/Electronics/'
        self.user_record_file = 'Electronics_user_records.pkl'
        self.user_mapping_file = 'Electronics_user_mapping.pkl'
        self.item_mapping_file = 'Electronics_item_mapping.pkl'
        self.item_content_file = 'word_counts.txt'
        self.item_relation_file = 'item_relation.pkl'

        # Default sizes; num_users / num_items are overwritten by
        # generate_dataset() with the lengths of the loaded mappings.
        self.num_users = 37204
        self.num_items = 13881
        self.vocab_size = 10104

        # Placeholders; nothing in this class assigns them afterwards.
        self.user_records = None
        self.user_mapping = None
        self.item_mapping = None

    def generate_dataset(self, seed=0):
        """Load the stored files and assemble the training inputs.

        Args:
            seed: Forwarded unchanged to ``split_data_randomly``.

        Returns:
            A 5-tuple ``(train_matrix, train_set, test_set,
            item_content_matrix, item_relation_matrix)``.
        """
        # Load the three pickles in the same order as they are listed.
        pickle_names = (
            self.user_record_file,
            self.user_mapping_file,
            self.item_mapping_file,
        )
        raw_records, user_map, item_map = [
            self.load_pickle(self.dir_path + name) for name in pickle_names
        ]

        # The actual sizes come from the mappings, not the defaults.
        self.num_users = len(user_map)
        self.num_items = len(item_map)

        # Only the converted records are used; the two inverse mappings
        # returned alongside them are discarded.
        inner_records, _user_inverse, _item_inverse = \
            self.convert_to_inner_index(raw_records, user_map, item_map)

        train_set, test_set = self.split_data_randomly(inner_records, seed)

        train_matrix = self.generate_rating_matrix(
            train_set, self.num_users, self.num_items)
        content_path = self.dir_path + self.item_content_file
        item_content_matrix = self.load_item_content(
            content_path, self.vocab_size)
        relation_path = self.dir_path + self.item_relation_file
        item_relation_matrix = self.load_pickle(relation_path)

        return (train_matrix, train_set, test_set,
                item_content_matrix, item_relation_matrix)
42
+
43
+
44
class Books(DataSet):
    """Amazon "Books" review dataset.

    Holds the directory and file names of the pre-processed data (including
    a pickled review word sequence file) and default corpus sizes.  The
    helper methods called in ``generate_dataset`` are not defined here;
    presumably they come from ``DataSet`` -- verify there.
    """

    def __init__(self):
        # NOTE(review): the base-class initializer is not invoked here.
        # Where the pre-processed files live.
        self.dir_path = './dataset/data/amazon/Books/'
        self.user_record_file = 'Books_user_records.pkl'
        self.user_mapping_file = 'Books_user_mapping.pkl'
        self.item_mapping_file = 'Books_item_mapping.pkl'
        self.item_content_file = 'word_counts.txt'
        self.item_relation_file = 'item_relation.pkl'
        self.item_word_seq_file = 'review_word_sequence.pkl'

        # Default sizes; num_users / num_items are overwritten by
        # generate_dataset() with the lengths of the loaded mappings.
        self.num_users = 65476
        self.num_items = 41264
        self.vocab_size = 27584

        # Placeholders; nothing in this class assigns them afterwards.
        self.user_records = None
        self.user_mapping = None
        self.item_mapping = None

    def generate_dataset(self, seed=0):
        """Load the stored files and assemble the training inputs.

        Args:
            seed: Forwarded unchanged to ``split_data_randomly``.

        Returns:
            A 6-tuple ``(train_matrix, train_set, test_set,
            item_content_matrix, item_relation_matrix, word_seq)`` where
            ``word_seq`` is the unpickled content of ``item_word_seq_file``.
        """
        # Load the four pickles in the same order as they are listed.
        pickle_names = (
            self.user_record_file,
            self.user_mapping_file,
            self.item_mapping_file,
            self.item_word_seq_file,
        )
        raw_records, user_map, item_map, word_seq = [
            self.load_pickle(self.dir_path + name) for name in pickle_names
        ]

        # The actual sizes come from the mappings, not the defaults.
        self.num_users = len(user_map)
        self.num_items = len(item_map)

        # Only the converted records are used; the two inverse mappings
        # returned alongside them are discarded.
        inner_records, _user_inverse, _item_inverse = \
            self.convert_to_inner_index(raw_records, user_map, item_map)

        train_set, test_set = self.split_data_randomly(inner_records, seed)

        train_matrix = self.generate_rating_matrix(
            train_set, self.num_users, self.num_items)
        content_path = self.dir_path + self.item_content_file
        item_content_matrix = self.load_item_content(
            content_path, self.vocab_size)
        relation_path = self.dir_path + self.item_relation_file
        item_relation_matrix = self.load_pickle(relation_path)

        return (train_matrix, train_set, test_set,
                item_content_matrix, item_relation_matrix, word_seq)
82
+
83
+
84
class CDs(DataSet):
    """Amazon "CDs" review dataset.

    Holds the directory and file names of the pre-processed data (including
    a pickled review word sequence file) and default corpus sizes.  The
    helper methods called in ``generate_dataset`` are not defined here;
    presumably they come from ``DataSet`` -- verify there.
    """

    def __init__(self):
        # NOTE(review): the base-class initializer is not invoked here.
        # Where the pre-processed files live.
        self.dir_path = './dataset/data/amazon/CDs/'
        self.user_record_file = 'CDs_user_records.pkl'
        self.user_mapping_file = 'CDs_user_mapping.pkl'
        self.item_mapping_file = 'CDs_item_mapping.pkl'
        self.item_content_file = 'word_counts.txt'
        self.item_relation_file = 'item_relation.pkl'
        self.item_word_seq_file = 'review_word_sequence.pkl'

        # Default sizes; num_users / num_items are overwritten by
        # generate_dataset() with the lengths of the loaded mappings.
        self.num_users = 24934
        self.num_items = 24634
        self.vocab_size = 24341

        # Placeholders; nothing in this class assigns them afterwards.
        self.user_records = None
        self.user_mapping = None
        self.item_mapping = None

    def generate_dataset(self, seed=0):
        """Load the stored files and assemble the training inputs.

        Args:
            seed: Forwarded unchanged to ``split_data_randomly``.

        Returns:
            A 6-tuple ``(train_matrix, train_set, test_set,
            item_content_matrix, item_relation_matrix, word_seq)`` where
            ``word_seq`` is the unpickled content of ``item_word_seq_file``.
        """
        # Load the four pickles in the same order as they are listed.
        pickle_names = (
            self.user_record_file,
            self.user_mapping_file,
            self.item_mapping_file,
            self.item_word_seq_file,
        )
        raw_records, user_map, item_map, word_seq = [
            self.load_pickle(self.dir_path + name) for name in pickle_names
        ]

        # The actual sizes come from the mappings, not the defaults.
        self.num_users = len(user_map)
        self.num_items = len(item_map)

        # Only the converted records are used; the two inverse mappings
        # returned alongside them are discarded.
        inner_records, _user_inverse, _item_inverse = \
            self.convert_to_inner_index(raw_records, user_map, item_map)

        train_set, test_set = self.split_data_randomly(inner_records, seed)

        train_matrix = self.generate_rating_matrix(
            train_set, self.num_users, self.num_items)
        content_path = self.dir_path + self.item_content_file
        item_content_matrix = self.load_item_content(
            content_path, self.vocab_size)
        relation_path = self.dir_path + self.item_relation_file
        item_relation_matrix = self.load_pickle(relation_path)

        return (train_matrix, train_set, test_set,
                item_content_matrix, item_relation_matrix, word_seq)
122
+
123
+
124
if __name__ == '__main__':
    # Ad-hoc inspection of the CDs dataset when the module is run directly.
    data_set = CDs()
    (train_matrix, train_set, test_set,
     item_content_matrix, item_relation_matrix, word_seq) = \
        data_set.generate_dataset()

    # Print the last word sequence, then the length of the longest one
    # (0 when there are no sequences at all).
    print(word_seq[-1])
    max_len = max((len(words) for words in word_seq), default=0)
    print(max_len)

    # Print the index of every row of the content matrix that has no
    # stored non-zero entries.
    row_count = item_content_matrix.shape[0]
    for row in range(row_count):
        if not item_content_matrix.getrow(row).getnnz():
            print(row)
0 commit comments