# group_creation.py
'''
# Group Creation based on "A Novel Group Recommender System" by Reza Barzegar Nozari and Hamidreza Koohi
This Python script implements a group creation method inspired by the paper "A Novel Group Recommender System Based
on Members’ Influence and Leader Impact" by Reza Barzegar Nozari and Hamidreza Koohi. The method uses
Fuzzy C-Means Clustering and Pearson Correlation Coefficient (PCC) to select a group of similar users with
shared preferences for items in a user-item dataset.
## Features
- Reads a user-item dataset in CSV format.
- Creates a user-item matrix to represent user-item interactions.
- Normalizes the user-item matrix using Min-Max scaling.
- Performs Fuzzy C-Means Clustering on the normalized matrix to group users.
- Filters users by a specified cluster label.
- Calculates PCC between a randomly selected user and other users in the same cluster.
- Selects the top similar users based on PCC values.
- Outputs and saves the selected group of similar users to a CSV file.
## Parameters
- dataset_path: Path to your user-item dataset in CSV format.
- target_cluster: The cluster label from which to select similar users.
- group_size: The number of similar users to select for the group.
- c: Number of clusters for Fuzzy C-Means Clustering (default is 3).
- m: Fuzzy exponent for FCM clustering (default is 80).
- max_iter: Maximum number of iterations for FCM clustering (default is 1000).
Feel free to adjust these parameters based on your dataset and requirements.
'''
import numpy as np
import pandas as pd
import skfuzzy as fuzz
from scipy.stats import pearsonr
def read_dataset(dataset_path):
    """Load the ratings dataset from a CSV source.

    Args:
        dataset_path: Location of the CSV data; handed unchanged to
            ``pandas.read_csv``.

    Returns:
        The parsed contents as a ``pandas.DataFrame``.
    """
    ratings_frame = pd.read_csv(dataset_path)
    return ratings_frame
def create_user_item_matrix(df):
    """Build a dense user-item rating matrix from a long-format ratings table.

    Args:
        df: DataFrame with "userId", "movieId" and "rating" columns, one row
            per rating.

    Returns:
        A float DataFrame whose index is the sorted unique user IDs and whose
        columns are the sorted unique movie IDs. Each cell holds the rating
        for that (user, movie) pair, or 0.0 when the pair does not occur in
        ``df``. If a pair occurs more than once, the last occurrence wins.
    """
    # Extract unique user and movie IDs
    movies_id = np.sort(df["movieId"].unique())
    users_id = np.sort(df["userId"].unique())

    # Keep only the last rating of each (user, movie) pair so the vectorised
    # assignment below has a well-defined result for repeated pairs.
    latest = df.drop_duplicates(subset=["userId", "movieId"], keep="last")

    # Both ID arrays are sorted and unique, so searchsorted maps every ID to
    # its row/column position without a Python-level loop over the ratings.
    row_pos = np.searchsorted(users_id, latest["userId"].to_numpy())
    col_pos = np.searchsorted(movies_id, latest["movieId"].to_numpy())

    # Create the user-item matrix (UIM) and fill it with ratings in one step
    values = np.zeros((len(users_id), len(movies_id)))
    values[row_pos, col_pos] = latest["rating"].to_numpy()
    return pd.DataFrame(values, index=users_id, columns=movies_id)
def normalize_user_item_matrix(UIM):
    """Rescale the whole user-item matrix to [0, 1] with Min-Max scaling.

    The minimum and maximum are taken over every cell of the matrix. They are
    computed on the underlying NumPy array because ``np.min``/``np.max``
    applied directly to a DataFrame may reduce per column or over the whole
    frame depending on the installed pandas version.

    Args:
        UIM: Numeric DataFrame of ratings.

    Returns:
        A DataFrame with the same index and columns as ``UIM``. When all cells
        share one value (zero range) the result is all zeros instead of
        NaN/inf from a division by zero. An empty matrix is returned as a
        float copy.
    """
    values = UIM.to_numpy(dtype=float)
    if values.size == 0:
        return UIM.astype(float)

    min_rating = values.min()
    max_rating = values.max()
    rating_range = max_rating - min_rating

    # Guard against division by zero for a constant matrix.
    if rating_range == 0:
        return UIM - min_rating
    return (UIM - min_rating) / rating_range
def perform_fuzzy_cmeans_clustering(normalized_matrix, c, m, max_iter):
    """Run Fuzzy C-Means on the matrix and return one hard cluster label per column of the membership matrix.

    The transposed matrix is passed to ``fuzz.cluster.cmeans`` with a stopping
    tolerance of 0.01 and no initial partition.

    Args:
        normalized_matrix: Matrix to cluster; its transpose is what is
            clustered.
        c: Number of clusters.
        m: Fuzzy exponent passed to the clustering routine.
        max_iter: Maximum number of iterations.

    Returns:
        The result of ``np.argmax`` over axis 0 of the final membership
        matrix, i.e. the index of the strongest cluster for each entry.
    """
    fcm_result = fuzz.cluster.cmeans(
        data=normalized_matrix.T,
        c=c,
        m=m,
        error=0.01,
        maxiter=max_iter,
        init=None,
    )
    # cmeans returns (cntr, u, u0, d, jm, p, fpc); only the final membership
    # matrix u (second element) is needed to assign labels.
    membership = fcm_result[1]
    return np.argmax(membership, axis=0)
def filter_users_by_cluster(UIM, cluster_membership, target_cluster):
    """Return the rows of the user-item matrix that belong to one cluster.

    Args:
        UIM: User-item DataFrame with one row per user.
        cluster_membership: Sequence of cluster labels, one per row of
            ``UIM`` and in the same order.
        target_cluster: Label of the cluster to keep.

    Returns:
        A new DataFrame holding only the rows whose label equals
        ``target_cluster``; index and columns are those of ``UIM``. It is
        empty when no row carries that label.

    Raises:
        ValueError: If the number of labels differs from the number of rows.
    """
    labels = np.asarray(cluster_membership).ravel()
    if labels.shape[0] != len(UIM):
        raise ValueError(
            "cluster_membership must contain exactly one label per row of UIM"
        )

    # Select rows with a boolean mask instead of attaching a temporary label
    # column: this avoids copying the whole matrix and deleting a column from
    # a filtered slice, and leaves the column labels untouched.
    in_target_cluster = labels == target_cluster
    return UIM.loc[in_target_cluster].copy()
def calculate_pcc(user1_ratings, user2_ratings):
    """Compute the Pearson correlation between two users' rating rows.

    Only the first row of each argument is used. A rating of 0 is treated as
    "not rated" when checking for common items; the correlation itself is
    still computed over the complete rows.

    Args:
        user1_ratings: Single-row DataFrame (or 1-D/2-D array-like) with the
            first user's ratings.
        user2_ratings: Same layout for the second user; items must be in the
            same positional order as in ``user1_ratings``.

    Returns:
        The Pearson correlation coefficient as a float, or ``None`` when the
        users share no item that both rated, or when the coefficient is
        undefined because one of the rows is constant.
    """
    ratings_a = np.atleast_2d(np.asarray(user1_ratings, dtype=float))[0]
    ratings_b = np.atleast_2d(np.asarray(user2_ratings, dtype=float))[0]

    # Check that both users have rated at least one common item. Comparing
    # column labels cannot do this when both rows come from the same matrix,
    # so look at the rating values instead.
    co_rated = (ratings_a != 0) & (ratings_b != 0)
    if not co_rated.any():
        return None

    # Pearson correlation is undefined for a constant vector (zero variance).
    if ratings_a.max() == ratings_a.min() or ratings_b.max() == ratings_b.min():
        return None

    pcc = pearsonr(ratings_a, ratings_b)[0]
    # Never hand NaN back to the caller: it cannot be ranked meaningfully.
    return None if np.isnan(pcc) else float(pcc)
def select_top_similar_users(cluster_data, group_size, random_state=None):
    """Pick the users of a cluster most similar to a randomly chosen user.

    A seed user is drawn at random from the cluster, its PCC with every other
    user of the cluster is computed via ``calculate_pcc``, and the
    ``group_size`` users with the highest PCC are kept. The seed user itself
    is not part of the returned group.

    Args:
        cluster_data: User-item DataFrame restricted to one cluster, indexed
            by user ID.
        group_size: Maximum number of similar users to return.
        random_state: Optional seed for a dedicated random generator, for
            reproducible selection. When ``None`` (default) NumPy's global
            random state is used.

    Returns:
        The rows of ``cluster_data`` belonging to the selected users, in the
        row order of ``cluster_data``. The result is empty when the cluster
        is empty or no other user has a defined PCC with the seed user.
    """
    user_ids = cluster_data.index.unique()
    # An empty cluster has nobody to draw from; return an empty group rather
    # than failing inside the random choice.
    if len(user_ids) == 0:
        return cluster_data.copy()

    # Select a random user from the cluster
    rng = np.random if random_state is None else np.random.default_rng(random_state)
    random_user = rng.choice(user_ids)

    # The seed user's ratings do not change, so slice them once.
    seed_ratings = cluster_data[cluster_data.index == random_user]

    # Calculate PCC with the randomly selected user for all users in the same cluster
    pcc_values = []
    for user_id in user_ids:
        if user_id == random_user:
            continue
        candidate_ratings = cluster_data[cluster_data.index == user_id]
        pcc = calculate_pcc(seed_ratings, candidate_ratings)
        # Skip undefined similarities; NaN would make the ranking below
        # order-dependent because every comparison with NaN is False.
        if pcc is None or np.isnan(pcc):
            continue
        pcc_values.append((user_id, pcc))

    # Sort users by PCC values in descending order
    pcc_values.sort(key=lambda pair: pair[1], reverse=True)

    # Extract the user IDs of the top similar users
    similar_user_ids = [user_id for user_id, _ in pcc_values[:group_size]]

    # Create a new DataFrame containing data of the top similar users as a group
    return cluster_data[cluster_data.index.isin(similar_user_ids)]
# Main function to orchestrate the entire process
def main(dataset_path, target_cluster, group_size, c=3, m=80, max_iter=1000,
         output_path='Groups/Group_data.csv'):
    """Create a group of similar users from a ratings CSV and save it.

    Args:
        dataset_path: Path to the user-item dataset in CSV format.
        target_cluster: Cluster label from which the group is selected.
        group_size: Number of similar users to select for the group.
        c: Number of clusters for Fuzzy C-Means clustering.
        m: Fuzzy exponent for Fuzzy C-Means clustering.
        max_iter: Maximum number of clustering iterations.
        output_path: CSV file the group is written to. Missing parent
            directories are created.

    Returns:
        The DataFrame of the selected group, which is also printed and
        written to ``output_path``.
    """
    import os  # local import keeps this function self-contained

    # Step 1: Read the dataset
    df = read_dataset(dataset_path)

    # Step 2: Create the user-item matrix
    UIM = create_user_item_matrix(df)

    # Step 3: Normalize the user-item matrix
    normalized_matrix = normalize_user_item_matrix(UIM)

    # Step 4: Perform Fuzzy C-Means Clustering
    cluster_membership = perform_fuzzy_cmeans_clustering(normalized_matrix, c, m, max_iter)

    # Step 5: Filter users by cluster
    cluster_data = filter_users_by_cluster(UIM, cluster_membership, target_cluster)

    # Step 6: Select top similar users as a group
    similar_users_group = select_top_similar_users(cluster_data, group_size)
    print(similar_users_group)

    # to_csv does not create directories, so make sure the target one exists.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    similar_users_group.to_csv(output_path)
    return similar_users_group
# Example usage:
if __name__ == "__main__":
    # Build a group of five users from cluster 1 of the sample dataset,
    # leaving the clustering parameters at their defaults.
    main(dataset_path='data/Data.csv', target_cluster=1, group_size=5)