# -*- coding: utf-8 -*-
"""
Created on Sat May 26 16:50:08 2018
@author: PUNEETMATHUR Copyright 2018 All rights reserved
DO NOT COPY WITHOUT PERMISSION
"""
import warnings
warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib")
warnings.filterwarnings("ignore", category = UserWarning, module = "cross_validation")
# Display inline matplotlib plots with IPython
from IPython import get_ipython
ipython = get_ipython()
if ipython is not None:
    ipython.run_line_magic('matplotlib', 'inline')
###########################################
# Importing libraries
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import pandas as pd
import numpy as np
import pandas.plotting
from IPython.display import display  # display() is used throughout for rich output
####################################################
####################################################
######### FUNCTIONS FOR DATA VISUALIZATION #########
####################################################
####################################################
def pcaOutput(good_data, pca):
    '''
    Visualizes the PCA results and calculates the explained variance.
    '''
    # Dimension indexing through the PCA components
    dims = ['Dimension {}'.format(i) for i in range(1, len(pca.components_) + 1)]
    # PCA components as a pandas DataFrame indexed by dimension
    comps = pd.DataFrame(np.round(pca.components_, 4), columns = good_data.keys())
    comps.index = dims
    # Explained variance for each component
    ratios = pca.explained_variance_ratio_.reshape(len(pca.components_), 1)
    variance_ratios = pd.DataFrame(np.round(ratios, 4), columns = ['Explained Variance'])
    variance_ratios.index = dims
    # Bar plot visualization for better understanding
    fig, ax = plt.subplots(figsize = (24, 12))
    # Plotting the feature weights as a function of the components
    comps.plot(ax = ax, kind = 'bar');
    ax.set_ylabel("Feature Weights")
    ax.set_xticklabels(dims, rotation = 0)
    # Displaying the explained variance ratio above each component
    for i, ev in enumerate(pca.explained_variance_ratio_):
        ax.text(i - 0.40, ax.get_ylim()[1] + 0.05, "Explained Variance\n %.4f" % (ev))
    # Returning a concatenated DataFrame of variance ratios and components
    return pd.concat([variance_ratios, comps], axis = 1)
def clusterOutput(redData, preds, centers, pca_samples):
    '''
    Visualizes the PCA-reduced cluster data in two dimensions and marks the
    cluster centers along with the selected sample points.
    '''
    preds = pd.DataFrame(preds, columns = ['Cluster'])
    plot_data = pd.concat([preds, redData], axis = 1)
    # Generating the cluster plot
    fig, ax = plt.subplots(figsize = (14, 8))
    # Creating the color map to distinguish between clusters
    cmap = cm.get_cmap('gist_rainbow')
    # Coloring the points based on their assigned cluster
    for i, cluster in plot_data.groupby('Cluster'):
        cluster.plot(ax = ax, kind = 'scatter', x = 'Dimension 1', y = 'Dimension 2', \
                     color = cmap(i * 1.0 / (len(centers) - 1)), label = 'Cluster %i' % (i), s = 30);
    # Plotting the centers with cross mark indicators
    for i, c in enumerate(centers):
        ax.scatter(x = c[0], y = c[1], color = 'white', edgecolors = 'black', \
                   alpha = 1, linewidth = 2, marker = 'o', s = 200);
        ax.scatter(x = c[0], y = c[1], marker = '$%d$' % (i), alpha = 1, s = 100);
    # Plotting the transformed sample points
    ax.scatter(x = pca_samples[:, 0], y = pca_samples[:, 1], \
               s = 150, linewidth = 4, color = 'black', marker = 'x');
    # Plot title
    ax.set_title("Cluster Learning on PCA-Reduced Data - Centroids with Numerical Marking\nTick marks denote Transformed Sample Data");
def biPlotter(cleanData, redData, pca):
    '''
    Builds a biplot for the PCA of the reduced data, overlaying the
    projections of the original features.
    cleanData: original data, before transformation, as a pandas DataFrame
               with valid column names
    redData: the reduced data (the first two dimensions are plotted)
    pca: pca object that contains the components_ attribute
    Returns: a matplotlib AxesSubplot object (for any additional customization)
    This function is inspired by the script by Teddy Roland on Biplot in Python:
    https://github.com/teddyroland/python-biplot
    '''
    fig, ax = plt.subplots(figsize = (14, 8))
    # Scatterplot of the reduced data
    ax.scatter(x = redData.loc[:, 'Dimension 1'], y = redData.loc[:, 'Dimension 2'],
               facecolors = 'b', edgecolors = 'b', s = 70, alpha = 0.5)
    feature_vectors = pca.components_.T
    # Scaling factors to make the arrows easier to see
    arrow_size, text_pos = 7.0, 8.0
    # Projections of the original features
    for i, v in enumerate(feature_vectors):
        ax.arrow(0, 0, arrow_size * v[0], arrow_size * v[1],
                 head_width = 0.2, head_length = 0.2, linewidth = 2, color = 'red')
        ax.text(v[0] * text_pos, v[1] * text_pos, cleanData.columns[i], color = 'black',
                ha = 'center', va = 'center', fontsize = 18)
    ax.set_xlabel("Dimension 1", fontsize = 14)
    ax.set_ylabel("Dimension 2", fontsize = 14)
    ax.set_title("Principal Component plane with original feature projections.", fontsize = 16);
    return ax
def channelOutput(redData, outliers, pca_samples):
    '''
    Visualizes the PCA-reduced cluster data in two dimensions using the full
    dataset, labeled by 'Channel', with points added for the selected samples.
    '''
    # Check that the dataset is loadable
    try:
        full_data = pd.read_csv("E:/retail2.csv")
    except Exception:
        print("Dataset could not be loaded. Is the file missing?")
        return False
    # Create the Channel DataFrame, dropping the same outliers as the reduced data
    chl = pd.DataFrame(full_data['Channel'], columns = ['Channel'])
    chl = chl.drop(chl.index[outliers]).reset_index(drop = True)
    labeled = pd.concat([redData, chl], axis = 1)
    # Generate the cluster plot
    fig, ax = plt.subplots(figsize = (14, 8))
    # Color map
    cmap = cm.get_cmap('gist_rainbow')
    # Color the points based on their assigned Channel
    labels = ['Segment 1/Segment 2/Segment 3', 'Retail Customer']
    grouped = labeled.groupby('Channel')
    for i, channel in grouped:
        channel.plot(ax = ax, kind = 'scatter', x = 'Dimension 1', y = 'Dimension 2', \
                     color = cmap((i - 1) * 1.0 / 2), label = labels[i - 1], s = 30);
    # Plot the transformed sample points
    for i, sample in enumerate(pca_samples):
        ax.scatter(x = sample[0], y = sample[1], \
                   s = 200, linewidth = 3, color = 'black', marker = 'o', facecolors = 'none');
        ax.scatter(x = sample[0] + 0.25, y = sample[1] + 0.3, marker = '$%d$' % (i), alpha = 1, s = 125);
    # Set plot title
    ax.set_title("PCA-Reduced Data Labeled by 'Channel'\nTransformed Sample Data Circled");
###########################################################
###########################################################
######### END OF FUNCTIONS FOR DATA VISUALIZATION #########
###########################################################
###########################################################
##########################################################################
##########################################################################
####################### IMPLEMENTATION CODE #############################
##########################################################################
##########################################################################
###########################################
# Suppressing matplotlib and other module deprecation user warnings
# (necessary for newer versions of matplotlib)
import warnings
warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib")
warnings.filterwarnings("ignore", category = UserWarning, module = "cross_validation")
warnings.filterwarnings("ignore", category = DeprecationWarning)
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import pandas as pd
import numpy as np
# Load the retail customers dataset
try:
    data = pd.read_csv("E:\\retail2.csv")
    data.drop(['Region', 'Channel'], axis = 1, inplace = True)
    print("Retail customers dataset has {} samples with {} features each.".format(*data.shape))
except Exception:
    print("Dataset could not be loaded. Is the dataset missing?")
# Display a description of the dataset
display(data.describe())
# Selecting three indices of my choice from the dataset
indices = [225,182,400]
# Creating a DataFrame of the chosen samples
samples = pd.DataFrame(data.loc[indices], columns = data.keys()).reset_index(drop = True)
print "Chosen samples of Retail customers dataset:"
display(samples)
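# A small hedged sketch (my addition; the 'percentiles' name is mine): express
# each chosen sample as the percentile rank of its feature within the full
# dataset, which makes high vs low spenders immediately visible.
percentiles = data.rank(pct = True).loc[indices].round(2)
display(percentiles)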
# Looking at the plots to check the trend
samples.transpose().plot(kind = 'bar', stacked = False, figsize = (8, 8), title = 'Retail customers dataset')
# Calculating deviations from Mean and Median
#to check how much the 3 samples deviate from the Center.
delta_mean = samples - data.mean().round()
delta_median = samples - data.median().round()
#display(delta_mean, delta_median)
delta_mean.plot.bar(figsize=(30,10), title='Compared to MEAN', grid=True)
delta_median.plot.bar(figsize=(30,10), title='Compared to MEDIAN', grid=True);
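# Reading these plots (added note): bars above zero mean the sample buys more
# of that category than the typical customer; bars below zero mean less.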
#Importing Libraries
# Note: sklearn.cross_validation was removed in scikit-learn 0.20;
# model_selection is its replacement
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
# The logic is broken down into a scoring function and a for loop
def scorer(feature):
    # Copy the DataFrame, using 'drop' to remove the given feature
    relevant_data = data.drop([feature], axis = 1)
    # Split the data into training and testing sets using the given feature as the target
    X_train, X_test, y_train, y_test = train_test_split(relevant_data, data[feature], test_size = 0.25, random_state = 45)
    # Create a decision tree regressor and fit it to the training set
    regressor = DecisionTreeRegressor(random_state = 45).fit(X_train, y_train)
    # Report the score of the prediction using the testing set
    score = regressor.score(X_test, y_test)
    print("The R2 score for feature {:16} is {:+.5f}".format(feature, score))
for feature in data.columns.values:
    scorer(feature)
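# Interpretation note (added): a high R^2 above means the feature can be
# reconstructed from the remaining features and so carries little independent
# information; a low or negative R^2 marks a feature that is hard to predict
# and therefore worth keeping for segmentation.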
# Produce a scatter matrix for each pair of features in the data
pd.plotting.scatter_matrix(data, alpha = 0.3, figsize = (24,12), diagonal = 'kde');
# Checking the correlation table
display(data.corr())
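# Note (added): strongly correlated pairs in this table match the linear
# trends visible in the scatter matrix, and the right-skewed feature
# distributions on its diagonal are what motivate the log scaling next.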
# Scale the data using the natural logarithm
log_data = np.log(data)
# Scale the sample data using the natural logarithm
log_samples = np.log(samples)
# Display the log-transformed sample data
display(log_samples)
display(log_samples.corr())
#Identifying Outliers
#Using Counters
from collections import Counter
outliers_counter = Counter()
# For each feature, find the data points with extremely high or low values - the outliers
outliers_scores = None
for feature in log_data.keys():
    # Calculate Q1 (25th percentile of the data) for the given feature
    Q1 = np.percentile(log_data[feature], 25)
    # Calculate Q3 (75th percentile of the data) for the given feature
    Q3 = np.percentile(log_data[feature], 75)
    # Use the interquartile range to calculate an outlier step (1.5 times the IQR)
    step = 1.5 * (Q3 - Q1)
    # Signed distances past the upper (Q3 + step) and lower (Q1 - step) fences;
    # points inside the fences contribute zero to the outlier score
    empty = np.zeros(len(log_data[feature]))
    aboveQ3 = log_data[feature].values - Q3 - step
    belowQ1 = log_data[feature].values - Q1 + step
    current_outliers_scores = np.array(np.maximum(empty, aboveQ3) - np.minimum(empty, belowQ1)).reshape([-1, 1])
    outliers_scores = current_outliers_scores if outliers_scores is None else np.hstack([outliers_scores, current_outliers_scores])
    # Display the outliers
    print("Data points considered outliers for the feature '{}':".format(feature))
    current_outliers = log_data[~((log_data[feature] >= Q1 - step) & (log_data[feature] <= Q3 + step))]
    display(current_outliers)
    outliers_counter.update(current_outliers.index.values)
# Select the indices to remove: points flagged as an outlier in two or more
# features, since a single-feature extreme may be a legitimate niche customer
# while multi-feature extremes distort both the PCA directions and the clusters
min_outliers_count = 2
outliers = [x[0] for x in outliers_counter.items() if x[1] >= min_outliers_count]
print("Data points considered outliers for more than 1 feature: {}".format(outliers))
# Remove the outliers, if any were specified
good_data = log_data.drop(log_data.index[outliers]).reset_index(drop = True)
# Apply PCA by fitting the good data with the same number of dimensions as features
from sklearn.decomposition import PCA
pca = PCA(n_components=len(good_data.columns)).fit(good_data)
# Transform log_samples using the PCA fit above
pca_samples = pca.transform(log_samples)
# Generate the PCA results plot (stored under a new name so the
# pcaOutput function itself is not shadowed)
pcaResults = pcaOutput(good_data, pca)
# Show results
display(pcaResults)
# Cumulative explained variance should add up to 1
display(pcaResults['Explained Variance'].cumsum())
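# A minimal sketch (my addition): choose the smallest number of components
# that explains at least 90% of the variance; the 0.90 threshold is an
# assumption for illustration, not part of the original analysis.
cum_var = pcaResults['Explained Variance'].cumsum().values
n_keep = int(np.argmax(cum_var >= 0.90)) + 1
print("Components needed for >= 90% of the variance: {}".format(n_keep))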
from sklearn.decomposition import PCA
# Apply PCA by fitting the good data with only two dimensions
pca = PCA(n_components=2).fit(good_data)
# Transform the good data using the PCA fit above
reduced_data = pca.transform(good_data)
# Transform the sample log-data using the PCA fit above
pca_samples = pca.transform(log_samples)
# Create a DataFrame for the reduced data
reduced_data = pd.DataFrame(reduced_data, columns = ['Dimension 1', 'Dimension 2'])
# Display sample log-data after applying PCA transformation in two dimensions
display(pd.DataFrame(np.round(pca_samples, 4), columns = ['Dimension 1', 'Dimension 2']))
# Create a biplot
biPlotter(good_data, reduced_data, pca)
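# Reading the biplot (added note): arrows pointing in similar directions
# indicate positively correlated features, and the farther an arrow extends
# along a dimension, the more that feature contributes to it.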
# Import the Gaussian mixture model and silhouette score libraries
# (sklearn.mixture.GMM was removed in newer scikit-learn;
# GaussianMixture is its replacement)
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
# The logic is divided into clustering and scoring functions,
# then iterated through the clusters in the identified range.
# This function creates the clusters using a Gaussian mixture model
def clusterCreator(data, n_clusters):
    clusterer = GaussianMixture(n_components = n_clusters, covariance_type = 'full', random_state = 45).fit(data)
    preds = clusterer.predict(data)
    centers = clusterer.means_
    sample_preds = clusterer.predict(pca_samples)
    return preds, centers, sample_preds
# Scoring after creating the clusters; a separate name avoids shadowing
# the feature-relevance scorer defined above
def clusterScorer(data, n_clusters):
    preds, _, _ = clusterCreator(data, n_clusters)
    score = silhouette_score(data, preds)
    return score
# Iterate over the candidate cluster counts and print the silhouette score
for n_clusters in range(2, 10):
    score = clusterScorer(reduced_data, n_clusters)
    print("For n_clusters = {} the average silhouette_score is : {}".format(n_clusters, score))
# Refit with two mixture components, the cluster count chosen from the
# silhouette scores above
clusterer = GaussianMixture(n_components = 2, covariance_type = 'full', random_state = 45).fit(reduced_data)
predictions = clusterer.predict(reduced_data)
centers = clusterer.means_
sample_preds = clusterer.predict(pca_samples)
# Display the results of the clustering from implementation
clusterOutput(reduced_data, predictions, centers, pca_samples)
# Map the centers back from PCA space: inverse_transform returns log-space
# coordinates because the data were log-scaled before PCA
log_centers = pca.inverse_transform(centers)
# Exponentiate to recover the centers in the original units of the data
true_centers = np.exp(log_centers)
# Display the true centers
segments = ['Segment {}'.format(i) for i in range(0, len(centers))]
true_centers = pd.DataFrame(np.round(true_centers), columns = data.keys())
true_centers.index = segments
display(true_centers)
# Compare the true centers with the central values (mean and median)
#display(true_centers - data.mean().round())
delta_mean = true_centers - data.mean().round()
#display(true_centers - data.median().round())
delta_median = true_centers - data.median().round()
delta_mean.plot.bar(figsize=(10,4), title='Compared to MEAN', grid=True)
delta_median.plot.bar(figsize=(10,4), title='Compared to MEDIAN', grid=True);
# Display the predictions for the selected sample points
for i, pred in enumerate(sample_preds):
    print("Sample point", i, "predicted to be in Cluster", pred)
display(samples)
# Display the clustering results based on 'Channel' data
redData = reduced_data
channelOutput(redData, outliers, pca_samples)
##########################################################################
##########################################################################
####################### END OF CODE #####################################
##########################################################################
##########################################################################