# UtilRecommender.py
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn import metrics
import copy
import itertools
from sklearn.preprocessing import MinMaxScaler
from tqdm.auto import tqdm  # tqdm_notebook is deprecated; tqdm.auto picks the right frontend
from scipy import spatial
import logging
import os
import sys
from pandas.api.types import CategoricalDtype
class MFDataClean(object):
def __init__(self, df, user_col, item_col, item_name, interaction_col):
self.raw_df = df
self.user_col = user_col
self.item_col = item_col
self.item_name = item_name
self.interaction_col = interaction_col
self.raw_df[user_col] = self.raw_df[user_col].astype(int)
self.raw_df[item_col] = self.raw_df[item_col].astype(int)
self.raw_df[item_name] = self.raw_df[item_name].astype(str)
self.raw_df[interaction_col] = self.raw_df[interaction_col].astype(float)
self.clean_data = self.subset(
self.raw_df,
self.user_col,
self.item_col,
self.item_name,
self.interaction_col
)
self.clean_data = self.drop_missing(
self.clean_data,
self.user_col,
self.item_col,
self.interaction_col
)
self.item_lookup = self.create_item_lookup(
self.clean_data,
self.item_col,
self.item_name
)
self.clean_data = self.clean_data.drop(
labels=self.item_name, axis=1) # Remove col of item name from interaction dataset.
    def subset(self, df, *cols):
        """Return df restricted to the given columns."""
        subset = df.loc[:, cols]
        return subset
    def drop_missing(self, df, *cols):
        """Drop rows with a missing value in any of the given columns."""
        clean_df = df.copy()
        for col in cols:
            clean_df = clean_df.loc[clean_df[col].notnull()]
        return clean_df
    def create_item_lookup(self, df, item_col, item_name):
        """Return a lookup table of the unique (item_id, item_name) pairs."""
        item_lookup = df[[item_col, item_name]].drop_duplicates().reset_index(
            drop=True)  # Only keep unique item/description pairs
        return item_lookup
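# Illustrative usage of MFDataClean (a minimal sketch; `raw_df` below is a
# hypothetical interaction log, not data from this repo):
#
#     raw_df = pd.DataFrame({
#         'member_id': [1, 1, 2],
#         'product_id': [10, 11, 10],
#         'title': ['shoes', 'hat', 'shoes'],
#         'clicks': [3.0, 1.0, 2.0],
#     })
#     cleaner = MFDataClean(raw_df, user_col='member_id',
#                           item_col='product_id', item_name='title',
#                           interaction_col='clicks')
#     cleaner.clean_data   # (member_id, product_id, clicks) triples
#     cleaner.item_lookup  # product_id -> title lookup table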
def threshold_interact(df, user_col, item_col, uid_min, iid_min):
"""
REF: https://www.ethanrosenthal.com/2016/10/19/implicit-mf-part-1/
Return dataframe which every user has a least interacted with items more than iid_min required,
and item has a least interacted with users more than uid_min required.
@params
df: dataframe of columns (user, item, interaction_num)
user_col (str): user column name
item_col (str): item column name
uid_min (int): minimal items user needs to interact with
iid_min (int): minimal users item needs to interact with
@return
df
"""
n_users = df[user_col].unique().shape[0]
n_items = df[item_col].unique().shape[0]
sparsity = float(df.shape[0]) / float(n_users * n_items) * 100
print('Starting likes info')
print('Number of users: {}'.format(n_users))
print('Number of items: {}'.format(n_items))
print('Sparsity: {:4.3f}%'.format(sparsity))
done = False
while not done:
starting_shape = df.shape[0]
        # Drop users who have interacted with fewer than uid_min items.
        iid_counts = df.groupby(user_col)[item_col].count()
        df = df[~df[user_col].isin(iid_counts[
            iid_counts < uid_min].index.tolist())]
        # Drop items interacted with by fewer than iid_min users.
        uid_counts = df.groupby(item_col)[user_col].count()
        df = df[~df[item_col].isin(uid_counts[
            uid_counts < iid_min].index.tolist())]
ending_shape = df.shape[0]
if starting_shape == ending_shape:
done = True
        elif (df[user_col].unique().shape[0]) < 10 or (df[item_col].unique().shape[0]) < 10:
            done = True  # Too few users or items left: stop early.
    # The minimum-count guarantees only hold when the loop converged rather
    # than stopping early, i.e. when both dimensions stayed >= 10.
    if (df[user_col].unique().shape[0]) >= 10 and (df[item_col].unique().shape[0]) >= 10:
        assert (df.groupby(user_col)[item_col].count().min() >= uid_min)
        assert (df.groupby(item_col)[user_col].count().min() >= iid_min)
n_users = df[user_col].unique().shape[0]
n_items = df[item_col].unique().shape[0]
sparsity = float(df.shape[0]) / float(n_users * n_items) * 100
print('Ending likes info')
print('Number of users: {}'.format(n_users))
print('Number of items: {}'.format(n_items))
print('Sparsity: {:4.3f}%'.format(sparsity))
return df
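# Example (continuing the sketch above; threshold values are illustrative):
#
#     dense_df = threshold_interact(cleaner.clean_data,
#                                   user_col='member_id',
#                                   item_col='product_id',
#                                   uid_min=5, iid_min=5)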
def df2interact_mat(df, user_col, item_col, interact_col):
"""
Convert (user, item, interact) dataframe to sparse interaction matrix.
REF: https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.sparse.csr_matrix.html
***** csr_matrix((data, ij), [shape=(M, N)]) *****
where data and ij satisfy the relationship a[ij[0, k], ij[1, k]] = data[k]
:param df: dataframe of (user, item, interact)
:param user_col: (string) user_id column name
:param item_col: (string) item_id column name
:param interact_col: (string) interaction column name
:return:
interaction_sparse: (csr) interaction matrix
users: (numpy array) user_id
items: (numpy array) item_id
"""
# Extract unique ids.
users = np.array(list(np.sort(
df[user_col].unique()))) # Get our unique users
items = np.array(list(
df[item_col].unique())) # Get our unique items that were interacted with
interactions = list(
df[interact_col]) # All of our interactions
# Re-code user and item index for sparse matrix.
rows = df[user_col].astype(CategoricalDtype(
categories=users)).cat.codes # Re-code user index starting from 0 for sparse matrix.
cols = df[item_col].astype(CategoricalDtype(
categories=items)).cat.codes
# Construct sparse matrix.
# Get the associated column indices
interaction_sparse = sparse.csr_matrix(
(interactions, (rows, cols)), shape=(len(users), len(items))
)
return interaction_sparse, users, items
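# Example (continuing the sketch; builds the user-by-item matrix plus the
# id arrays that map matrix rows/columns back to raw ids):
#
#     interaction_sparse, users, items = df2interact_mat(
#         dense_df, 'member_id', 'product_id', 'clicks')
#     interaction_sparse.shape  # (n_users, n_items)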
def train_test_split(ratings, split_count, fraction=None, dir_path=None, output_file_long=None):
"""
Split recommendation data into train and test sets,
where non-overlapping non-zero entries are split into train and test sets.
REF: https://www.ethanrosenthal.com/2016/10/19/implicit-mf-part-1/
Params
------
ratings : scipy.sparse matrix
Interactions between users and items.
split_count : int
Number of user-item-interactions per user to move
from training to test set.
fractions : float
Fraction of users to split off some of their
interactions into test set. If None, then all
users are considered.
dir_path : string
directory where dataframe to be saved to
output_file_long : string
file name
Return
train (csr)
test (csr)
user_index: (list) user indices on whom some interactions are masked for testing set
"""
# Note: likely not the fastest way to do things below.
train = ratings.copy().tocoo()
train.eliminate_zeros()
test = sparse.lil_matrix(train.shape)
if fraction:
try:
user_index = np.random.choice(
np.where(np.bincount(train.row) >= split_count * 2)[0],
replace=False,
size=np.int32(np.floor(fraction * train.shape[0]))).tolist()
logging.info('length of user_index: {}'.format(len(user_index)))
except BaseException as e:
logging.info('Exception: {}'.format(e))
logging.info(
'Not enough users with >= {}, '
'interactions for fraction of {}'.format(
2 * split_count, fraction)
)
logging.info('Stop building models and exit program because not enough users for test...')
empty_df = pd.DataFrame(columns=['company', 'model', 'member_id', 'product_id', 'title', 'score'])
empty_df.to_csv(os.path.join(dir_path, output_file_long), index=False)
sys.exit(0)
else:
user_index = range(train.shape[0])
train = train.tolil()
for user in user_index:
test_ratings = np.random.choice(
ratings.getrow(user).indices, size=split_count,
replace=False) # indices for nonzero cols
train[user, test_ratings] = 0.
        # Move the held-out interactions into the test matrix.
        test[user, test_ratings] = ratings[user, test_ratings]
logging.info('Length of test non-zero rows: {}'.format(len(np.unique(test.nonzero()[0]))))
# Test and training are truly disjoint
assert (train.multiply(test).nnz == 0)
return train.tocsr(), test.tocsr(), user_index
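# Example (continuing the sketch; masks 5 interactions for a random 20% of
# eligible users -- the file name is only used on the failure path and is
# illustrative here):
#
#     train, test, user_index = train_test_split(
#         interaction_sparse, split_count=5, fraction=0.2,
#         dir_path='.', output_file_long='empty_reco.csv')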
def precision_at_k(model, true_ratings, k=5, user_index=None):
"""
Compute the percentage of top k items predicted by model are included in non-zero entries,
comparing with ratings matrix.
REF: https://www.ethanrosenthal.com/2016/10/19/implicit-mf-part-1/
:param model: (lightFM model)
:param true_ratings: (csr) interaction matrix
:param k: (int) top k preferable items
:param user_index: (list) index of matrix-split users
:return:
precisions: (float) mean top k precision for all users in user_index
"""
if not user_index:
user_index = range(true_ratings.shape[0])
precisions = []
for user in user_index:
# In case of large dataset, compute predictions row-by-row like below
predictions = np.array(
model.predict([user], list(range(true_ratings.shape[1])))) # preference prediction for all items
top_k = np.argsort(-predictions)[:k] # top k item index
labels = true_ratings.getrow(user).indices # non-zero index
precision = float(len(set(top_k) & set(labels))) / float(k)
precisions.append(precision)
return np.mean(precisions)
def auc_mean(model, train, test=None, user_index=None, mode='train'):
"""
Compute mean accuracy.
In train mode, only entries in train are used, while in test mode,
entries excluding non-zero train entries are used.
:param model: (lightFM model)
:param train: (csr)
:param test: (csr)
:param user_index: (list) index of matrix-split users
:param mode: (string) train/test mode
:return:
auc: (float) mean of accuracy for all users in user_index
"""
if not user_index:
user_index = range(train.shape[0])
auc = []
for user in user_index:
preds = np.array(model.predict([user], list(range(train.shape[1]))))
if mode == 'train':
col_indices = list(range(train.shape[1])) # entries from train matrix
actual = train[user, col_indices].toarray()[0] # list of list
actual[actual > 0] = 1
pred = preds[col_indices]
elif mode == 'test':
col_indices = set(range(train.shape[1])).difference(
set(train[user].nonzero()[1])) # entries excluding train matrix
col_indices = list(col_indices)
actual = test[user, col_indices].toarray()[0] # list of list
actual[actual > 0] = 1
pred = preds[col_indices]
fpr, tpr, thresholds = metrics.roc_curve(actual, pred)
auc.append(metrics.auc(fpr, tpr))
return np.mean(auc)
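# Example evaluation (a sketch only; LightFM is assumed because the
# docstrings reference it, but any object exposing a compatible
# `predict(user_ids, item_ids)` would do):
#
#     from lightfm import LightFM
#     model = LightFM(no_components=30, loss='warp')
#     model.fit(train, epochs=20)
#     precision_at_k(model, test, k=5, user_index=user_index)
#     auc_mean(model, train, test, user_index=user_index, mode='test')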
def print_log(row, header=False, spacing=12):
"""
Helper function to print log per line.
:param row: (list) list of metric values
:param header: (boolean) indicator to print header
:param spacing: (int) spacing
:return:
No return.
"""
top = ''
middle = ''
bottom = ''
for r in row:
top += '+{}'.format('-' * spacing)
        if isinstance(r, (str, int)):
            middle += '| {0:^{1}} '.format(r, spacing - 2)
elif isinstance(r, float):
middle += '| {0:^{1}.5f} '.format(r, spacing - 2)
bottom += '+{}'.format('=' * spacing)
top += '+'
middle += '|'
bottom += '+'
if header:
print(top)
print(middle)
print(bottom)
else:
print(middle)
print(top)
def learning_curve_predict_k(model, train, all_data, epochs=range(2, 40, 2), k=5, user_index=None):
"""
Helper function to depict the improvement as training iteration goes on at top k precision.
:param model: (lightFM model)
:param train: (csr)
:param all_data: (csr) original interaction matrix
:param epochs: (iterator)
:param k: (int) top k for precision coverage
:param user_index: (list) index of matrix-split users
:return:
model: (lightFM model)
train_precision: (list)
all_data_precision: (list)
"""
if not user_index:
user_index = range(train.shape[0])
prev_epoch = 0
train_precision = []
all_data_precision = []
headers = ['epochs', 'p@k train', 'p@k all_data']
print_log(headers, header=True)
for epoch in epochs:
iterations = epoch - prev_epoch # num of iteration to train
model.fit_partial(train, epochs=iterations)
train_precision.append(precision_at_k(model, train, k, user_index)) # p@k on train set only
all_data_precision.append(precision_at_k(model, all_data, k, user_index)) # p@k on all data
row = [epoch, train_precision[-1], all_data_precision[-1]]
print_log(row)
prev_epoch = epoch
return model, train_precision, all_data_precision
def learning_curve_auc(model, train, test, epochs=range(2, 40, 2), user_index=None):
"""
Helper function to depict the improvement as training iteration goes on at accuracy.
:param model: (lightFM model)
:param train: (csr)
:param test: (csr)
:param epochs: (iterator)
:param user_index: (list) index of matrix-split users
:return:
model: (lightFM model)
train_auc: (list)
test_auc: (list)
"""
if not user_index:
user_index = range(train.shape[0])
prev_epoch = 0
train_auc = []
test_auc = []
headers = ['epochs', 'auc train', 'auc test']
print_log(headers, header=True)
for epoch in epochs:
iterations = epoch - prev_epoch # num of iteration to train
model.fit_partial(train, epochs=iterations)
train_auc.append(
auc_mean(model, train, user_index=user_index, mode='train')
)
test_auc.append(
auc_mean(model, train, test, user_index=user_index, mode='test')
)
row = [epoch, train_auc[-1], test_auc[-1]]
print_log(row)
prev_epoch = epoch
return model, train_auc, test_auc
def grid_search_learning_curve_predict_k(
base_model,
train,
all_data,
param_grid,
user_index=None,
patk=5,
epochs=range(2, 40, 2)
):
"""
Inspired (stolen) from sklearn gridsearch
https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/model_selection/_search.py
REF: https://www.ethanrosenthal.com/2016/10/19/implicit-mf-part-1/
:param base_model: (lightFM model)
:param train: (csr)
:param all_data: (csr)
:param param_grid: (dict)
:param user_index: (list)
:param patk: (int) top k
:param epochs: (iterator)
:return:
curves: (list of dict) list of each learning curve
"""
curves = []
keys, values = zip(*param_grid.items())
# list combination of values in each parameter, such as (20, 0.01) (20, 0.05) etc.
for val in itertools.product(*values):
params = dict(zip(keys, val))
this_model = copy.deepcopy(base_model)
print_line = []
for k, v in params.items():
setattr(this_model, k, v) # set up model's hyper-parameter
print_line.append((k, v))
print(' | '.join('{}: {}'.format(k, v) for (k, v) in print_line))
_, train_patk, all_data_patk = learning_curve_predict_k(
this_model, train, all_data, epochs, k=patk, user_index=user_index)
curves.append({
'params': params,
'metric': {
'train': train_patk,
'all_data': all_data_patk
}
})
return curves
def grid_search_learning_curve_auc(
base_model,
train,
test,
param_grid,
user_index=None,
epochs=range(2, 40, 2)
):
"""
Inspired (stolen) from sklearn gridsearch
https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/model_selection/_search.py
:param base_model: (lightFM model)
:param train: (csr)
:param test: (csr)
:param param_grid: (dict)
:param user_index: (list)
:param epochs: (iterator)
:return:
curves: (list of dict) list of each learning curve
"""
curves = []
keys, values = zip(*param_grid.items())
# list combination of values in each parameter, such as (20, 0.01) (20, 0.05) etc.
for val in itertools.product(*values):
params = dict(zip(keys, val))
this_model = copy.deepcopy(base_model)
print_line = []
for k, v in params.items():
setattr(this_model, k, v) # set up model's hyper-parameter
print_line.append((k, v))
print(' | '.join('{}: {}'.format(k, v) for (k, v) in print_line))
_, train_auc, test_auc = learning_curve_auc(
this_model,
train,
test,
epochs,
user_index=user_index
)
curves.append({
'params': params,
'metric': {
'train': train_auc,
'test': test_auc
}
})
return curves
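# Example hyper-parameter sweep (a sketch; grid values are illustrative and
# LightFM is assumed, as above):
#
#     from lightfm import LightFM
#     base_model = LightFM(loss='warp')
#     param_grid = {'no_components': [20, 40],
#                   'learning_rate': [0.01, 0.05]}
#     curves = grid_search_learning_curve_auc(
#         base_model, train, test, param_grid,
#         user_index=user_index, epochs=range(2, 20, 2))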
def refit_model(base_model, best_params, interact_matrix, best_iters):
"""
re-train a model.
:param base_model: (lightFM model)
:param best_params: (dict) best hyper-parameters
:param interact_matrix: (csr)
:param best_iters: (int) best num of interations
:return:
model: (lightFM model)
"""
model = copy.deepcopy(base_model)
for k, v in best_params.items():
setattr(model, k, v)
model.fit(interact_matrix, epochs=best_iters)
return model
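# Example refit after model selection (values are hypothetical, e.g. read
# off the learning curves returned by the grid search above):
#
#     best_params = {'no_components': 40, 'learning_rate': 0.05}
#     final_model = refit_model(base_model, best_params,
#                               interaction_sparse, best_iters=30)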
def get_items_interact(
customer_id,
true_mf,
users,
items,
item_lookup
):
"""
REF: https://jessesw.com/Rec-System/
This just tells me which items have been already interacted with by a specific user in the interaction matrix.
@parameters:
customer_id: (int) Input the customer's id number that you want to see prior purchases of at least once
true_mf: (csr) The initial ratings matrix used
users: (numpy array) The array of customers used in the ratings matrix
items: (numpy array) The array of products used in the ratings matrix
item_lookup: (dataframe) A simple pandas dataframe of the unique product ID/product descriptions available
@returns:
A list of item IDs and item descriptions for a particular customer
that were already interacted with in the interaction matrix
"""
cust_ind = np.where(users == customer_id)[0][
0] # Returns the index row of our customer id
purchased_ind = true_mf[cust_ind, :].nonzero()[
1] # Get column indices of purchased items
prod_codes = items[
purchased_ind] # Get the stock codes for our purchased items
return item_lookup.loc[item_lookup.product_id.isin(prod_codes)]
def top_interact(customer_id, users, interaction_sparse, item_lookup, top_num=3, item_name='title'):
"""
Return the top interact items for customer.
:param customer_id: (int)
:param users: (list)
:param interaction_sparse: (csr)
:param item_lookup: (list)
:param top_num: (int) top number of interacted product from customer
:param item_name: (str) column for product name
:return:
(customer_id, top_interact_items): (tuple)
"""
# Find out the top three items with most interactions.
cust_ind = np.where(users == customer_id)[0][0]
interaction_nd = np.squeeze(interaction_sparse[cust_ind, :].toarray(), axis=0) # convert to ndarray for `squeeze`
sort_item_idx = np.argsort(interaction_nd)[::-1]
item_idx_value = interaction_nd[sort_item_idx]
top_item_idx = sort_item_idx[item_idx_value > 0][:top_num]
top_interact_items = []
for idx in top_item_idx:
top_interact_items.append(item_lookup.loc[idx][item_name])
# Fill in `None` if top_interact_items is less than top_num
if top_num - len(top_interact_items) > 0:
fill_num = top_num - len(top_interact_items)
fill_none = [None for _ in range(fill_num)]
top_interact_items.extend(fill_none)
return (customer_id, top_interact_items)
def rec_items(
customer_id,
exclude_mf,
user_vecs,
item_vecs,
users,
items,
item_lookup,
num_items=10,
item_name='title',
item_col='product_id'
):
"""
REF: https://jessesw.com/Rec-System/
This function will return the top recommended items to our users
@parameters:
customer_id: (int) Input the customer's id number that you want to get recommendations for
exclude_mf: (csr) The interaction matrix you use to exclude already purchased/interacted items
user_vecs - the user vectors from your fitted matrix factorization
item_vecs - the item vectors from your fitted matrix factorization
users: (numpy array) an array of the customer's ID numbers that make up the rows of your ratings matrix
(in order of matrix)
items: (numpy array) an array of the products that make up the columns of your ratings matrix
(in order of matrix)
item_lookup - A simple pandas dataframe of the unique product ID/product descriptions available
num_items: (int) The number of items you want to recommend in order of best recommendations. Default is 10.
item_name: (str) column for product name
item_col: (str) column for product id
@returns:
The top n recommendations chosen based on the user/item vectors for items never interacted with
"""
cust_ind = np.where(users == customer_id)[0][
0] # Returns the index row of our customer id
pref_vec = exclude_mf[cust_ind, :].toarray(
) # Get the ratings from the ratings matrix
pref_vec = pref_vec.reshape(-1) + 1 # Add 1 to everything, so that items not purchased yet become equal to 1
pref_vec[pref_vec > 1] = 0 # Make everything already purchased zero
rec_vector = user_vecs[cust_ind, :].dot(
item_vecs.T) # Get dot product of user vector and all item vectors
# Scale this recommendation vector between 0 and 1
min_max = MinMaxScaler()
rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1, 1))[:, 0]
recommend_vector = pref_vec * rec_vector_scaled
# Items already purchased have their recommendation multiplied by zero
product_idx = np.argsort(
recommend_vector
)[::-1][:num_items] # Sort the indices of the items into order
# of best recommendations
rec_list = [] # start empty list to store items
for index in product_idx:
code = items[index]
rec_list.append([
customer_id,
code,
item_lookup[item_name].loc[item_lookup[item_col] == code].iloc[0],
recommend_vector[index]
])
# Append our descriptions to the list
customer_ids = [item[0] for item in rec_list]
codes = [item[1] for item in rec_list]
descriptions = [item[2] for item in rec_list]
scores = [item[3] for item in rec_list]
final_frame = pd.DataFrame({
'member_id':customer_ids,
'product_id': codes,
'title': descriptions,
'score': scores
}) # Create a dataframe
return final_frame
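# Example (a sketch; with LightFM the latent factors can be taken from
# `model.user_embeddings` / `model.item_embeddings`, which ignores the bias
# terms -- an assumption, not something this module enforces):
#
#     recs = rec_items(customer_id=1,
#                      exclude_mf=interaction_sparse,
#                      user_vecs=final_model.user_embeddings,
#                      item_vecs=final_model.item_embeddings,
#                      users=users, items=items,
#                      item_lookup=cleaner.item_lookup,
#                      num_items=10)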
def random_rec(
customer_id,
cust_product_click,
group_data,
item_freq,
item_lookup,
num_reco=10,
user_col='member_id',
item_col='product_id',
item_name='title'
):
"""
Randomly sample recommendation list, excluding the items they have clicked/viewed.
The sampling distribution of items follows the frequency of items being viewed by users.
:param customer_id: (int)
:param cust_product_click: (series) index of customer with number of product they've clicked
:param group_data: (df) group of (user, item, interaction)
:param item_freq: (series) index of item with its frequency
:param item_lookup: (list)
:param num_reco: (int) number of recommendations
:param user_col: (str)
:param item_col: (str)
:param item_name: (str)
:return:
rec_df: (df) Recommendation list for a customer
"""
num_clicked = cust_product_click.loc[customer_id]
clicked_items = list(group_data[group_data[user_col] == customer_id][item_col])
num_total = num_clicked + num_reco
num_item = item_freq.shape[0]
if num_total > num_item:
recommend_items = [x for x in item_freq.index.values if x not in clicked_items]
fill_num = num_reco - len(recommend_items)
fill_items = np.random.choice(a=item_freq.index.values, size=fill_num, replace=False, p=item_freq)
recommend_items.extend(fill_items)
ranked_items = item_freq[item_freq.index.isin(recommend_items)].sort_values().index.values[:num_reco]
else:
sample_items = np.random.choice(a=item_freq.index.values, size=num_total, replace=False, p=item_freq)
recommend_items = [x for x in sample_items if x not in clicked_items]
ranked_items = item_freq[item_freq.index.isin(recommend_items)].sort_values().index.values[:num_reco]
title_lookup = item_lookup.set_index(item_col)
rank_title = list(title_lookup.loc[ranked_items][item_name])
rec_df = pd.DataFrame({
'member_id': [customer_id for _ in range(len(ranked_items))],
'product_id': ranked_items,
'title': rank_title,
'rank': [x for x in range(len(ranked_items))]})
return rec_df
def get_similar_items(
item_id,
items,
item_vecs,
item_lookup,
item_col='product_id',
item_name='title'
):
"""
REF: https://medium.com/radon-dev/als-implicit-collaborative-filtering-5ed653ba39fe
Get most similar items. The first most similar item will always be itself.
:param item_id: (int) product_id
:param items: (numpy array) array of product_id
:param item_vecs: (numpy array)
:param item_lookup: (df)
:param item_col: (str)
:param item_name: (str)
:return:
similar_df: (df)
"""
# Get the item row
item_row_index = np.where(items == item_id)[0][0]
item_vec = item_vecs[item_row_index].T
# Calculate the similarity score
# and select the top 10 most similar.
scores = item_vecs.dot(item_vec)
top_10_indices = np.argsort(scores)[::-1][:10]
similar_item_ids = []
similar_items = []
item_scores = []
# Get and print the actual item names and scores
for idx in top_10_indices:
similar_item_ids.append(item_lookup[item_col].loc[item_lookup[item_col] == items[idx]].iloc[0])
similar_items.append(item_lookup[item_name].loc[item_lookup[item_col] == items[idx]].iloc[0])
item_scores.append(scores[idx])
similar_df = pd.DataFrame({
'product_id': similar_item_ids,
'title': similar_items,
'score': item_scores})
return similar_df
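# Example (continuing the sketch; finds the 10 items closest to item 10 in
# the latent space -- the first hit is the item itself):
#
#     similar = get_similar_items(item_id=10, items=items,
#                                 item_vecs=final_model.item_embeddings,
#                                 item_lookup=cleaner.item_lookup)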
def item_similarity(items, item_interaction_sparse):
"""
REF: https://medium.com/@wwwbbb8510/comparison-of-user-based-and-item-based-collaborative-filtering-f58a1c8a3f1d
REF: http://kevingo.logdown.com/posts/245762-item-based-collaborative-filtering-recommendation-method
Return symmetric matrix for item similarity based on cosine similarity.
:param items: (numpy array) item_ids
:param item_interaction_sparse: (csr) (row: item, col: user) sparse matrix
:return:
item_similar: (numpy array)
"""
# Identity matrix.
item_num = len(items)
item_similar = np.identity(item_num)
# Fill in the blank.
    for i in tqdm(range(item_num)):
for j in range(item_num):
if j > i:
indices_i = set(item_interaction_sparse[i, :].nonzero()[1])
indices_j = set(item_interaction_sparse[j, :].nonzero()[1])
# Non-zero entries at both items.
indices_intersect = list(indices_i.intersection(indices_j))
if len(indices_intersect) == 0:
# print('{},{} items no intersected values...'.format(i,j))
item_similar[i, j] = 0 # if no interacted values, return 0
item_similar[j, i] = 0
else:
data_i = item_interaction_sparse[i, indices_intersect].toarray()[0]
data_j = item_interaction_sparse[j, indices_intersect].toarray()[0]
sim_score = 1 - spatial.distance.cosine(data_i, data_j)
item_similar[i, j] = sim_score
item_similar[j, i] = sim_score
return item_similar
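# Example (a sketch; note this function expects an item-by-user matrix, so
# transpose the usual user-by-item interactions first):
#
#     item_user_sparse = interaction_sparse.T.tocsr()
#     item_similar = item_similarity(items, item_user_sparse)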
def fill_unrate(user_interaction_sparse, users, items, item_similar):
"""
REF: https://medium.com/@wwwbbb8510/comparison-of-user-based-and-item-based-collaborative-filtering-f58a1c8a3f1d
REF: http://kevingo.logdown.com/posts/245762-item-based-collaborative-filtering-recommendation-method
Fill in (row: user, col: item) interaction matrix with predicted values on missing ratings.
:param user_interaction_sparse: (csr)
:param users: (numpy array) user_ids
:param items: (numpy array) item_ids
:param item_similar: (numpy array) item similarity matrix
:return:
        user_interaction_sparse_filled: (csr) filled with predicted ratings
"""
user_interaction_sparse_filled = copy.deepcopy(user_interaction_sparse).toarray() # to ndarray for efficiency
users_num = len(users)
items_num = len(items)
item_indices_set = set(list(range(items_num)))
    for member_index in tqdm(range(users_num)):
zero_indices = list(
item_indices_set.difference(
set(user_interaction_sparse[member_index, :].nonzero()[1])))
for unrate_index in zero_indices:
rated_indices = user_interaction_sparse[member_index, :].nonzero()[1]
S = item_similar[unrate_index, rated_indices]
abs_S = list(map(abs, S))
R = user_interaction_sparse[member_index, rated_indices].toarray()[0]
numerator = np.sum(np.multiply(R, S))
denominator = np.sum(abs_S)
if denominator == 0:
user_interaction_sparse_filled[
member_index, unrate_index] = numerator / 1.
else:
user_interaction_sparse_filled[
member_index, unrate_index] = numerator / denominator
return sparse.csr_matrix(user_interaction_sparse_filled)
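# Example (continuing the item_similarity sketch above):
#
#     filled = fill_unrate(interaction_sparse, users, items, item_similar)
#     # `filled` can then be scored with item_based_predict_at_k /
#     # item_based_auc_mean, or fed to item_based_rec_items.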
def item_based_predict_at_k(filled_ratings, true_ratings, k=5, user_index=None):
"""
Compute the percentage of top k items predicted by item-based matrix are actually included in
original non-zero ratings entries.
REF: https://www.ethanrosenthal.com/2016/10/19/implicit-mf-part-1/
:param filled_ratings: (csr) item-based matrix filled with predicted values
:param true_ratings: (csr) interaction matrix
:param k: (int) top k preferable items
:param user_index: (list) index of matrix-split users
:return:
precisions: (float) mean top k precision for all users in user_index
"""
if not user_index:
user_index = range(true_ratings.shape[0])
precisions = []
for user in user_index:
# In case of large dataset, compute predictions row-by-row like below
predictions = filled_ratings[user, :].toarray()[0] # preference prediction for all items
top_k = np.argsort(-predictions)[:k] # top k item index
labels = true_ratings.getrow(user).indices # non-zero index
precision = float(len(set(top_k) & set(labels))) / float(k)
precisions.append(precision)
return np.mean(precisions)
def item_based_auc_mean(filled_ratings, train, test=None, user_index=None, mode='train'):
"""
Compute mean accuracy.
In train mode, only entries in train are used, while in test mode,
entries excluding non-zero train entries are used.
:param filled_ratings: (csr) item-based matrix filled with predicted values
:param train: (csr)
:param test: (csr)
:param user_index: (list) index of matrix-split users
:param mode: (string) train/test mode
:return:
auc: (float) mean of accuracy for all users in user_index
"""
if not user_index:
user_index = range(train.shape[0])
auc = []
for user in user_index:
preds = filled_ratings[user, :].toarray()[0] # preference prediction for all items
if mode == 'train':
col_indices = list(range(train.shape[1])) # entries from train matrix
actual = train[user, col_indices].toarray()[0] # list of list
actual[actual > 0] = 1
pred = preds[col_indices]
elif mode == 'test':
col_indices = set(range(train.shape[1])).difference(
set(train.getrow(user).indices)) # entries excluding train matrix
col_indices = list(col_indices)
actual = test[user, col_indices].toarray()[0] # list of list
actual[actual > 0] = 1
pred = preds[col_indices]
fpr, tpr, thresholds = metrics.roc_curve(actual, pred)
auc.append(metrics.auc(fpr, tpr))
return np.mean(auc)
def item_based_rec_items(
customer_id,
exclude_mf,
filled_ratings,
users,
items,
item_lookup,
num_items=10,
item_col='product_id',
item_name='title'
):
"""
REF: https://jessesw.com/Rec-System/
This function will return the top recommended items to our users
@parameters:
customer_id: (int) Input the customer's id number that you want to get recommendations for
exclude_mf: (csr) The interaction matrix you use to exclude already purchased/interacted items
filled_ratings: (csr) item-based matrix filled with predicted values
users: (numpy array) an array of the customer's ID numbers that make up the rows of your ratings matrix
(in order of matrix)
items: (numpy array) an array of the products that make up the columns of your ratings matrix
(in order of matrix)
item_lookup - A simple pandas dataframe of the unique product ID/product descriptions available
num_items: (int) The number of items you want to recommend in order of best recommendations. Default is 10.
item_col: (str)
item_name: (str)
@returns:
The top n recommendations chosen based on the user/item vectors for items never interacted with
"""
cust_ind = np.where(users == customer_id)[0][
0] # Returns the index row of our customer id
pref_vec = exclude_mf[cust_ind, :].toarray(
) # Get the ratings from the ratings matrix
pref_vec = pref_vec.reshape(-1) + 1 # Add 1 to everything, so that items not purchased yet become equal to 1
pref_vec[pref_vec > 1] = 0 # Make everything already purchased zero
rec_vector = filled_ratings[cust_ind, :].toarray()
# Scale this recommendation vector between 0 and 1
min_max = MinMaxScaler()
rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1, 1))[:, 0]
# Items already purchased have their recommendation multiplied by zero
recommend_vector = pref_vec * rec_vector_scaled
product_idx = np.argsort(
recommend_vector
)[::-1][:num_items] # Sort the indices of the items into order
# of best recommendations
rec_list = [] # start empty list to store items
for index in product_idx:
code = items[index]
rec_list.append([
code, item_lookup[item_name].loc[item_lookup[item_col] == code].iloc[0]
])
# Append our descriptions to the list
codes = [item[0] for item in rec_list]
descriptions = [item[1] for item in rec_list]
final_frame = pd.DataFrame({
'product_id': codes,
'title': descriptions
}) # Create a dataframe
return final_frame[['product_id', 'title']] # Switch order of columns around