# -*- coding: utf-8 -*-
"""
Created on Tue Sep 11 14:03:51 2018
@author: PUNEETMATHUR
"""
#Importing python libraries
import os
import numpy as np
import pandas as pd

print(os.getcwd())

#Creating visualization functions which will be used throughout this model
import matplotlib.pyplot as pl
import matplotlib.patches as mpatches
from time import time
from sklearn.metrics import f1_score, accuracy_score
def distribution(data, transformed=False):
    """
    Visualization code for displaying skewed distributions of features
    """
    # Create figure
    fig = pl.figure(figsize=(11, 5))

    # Skewed feature plotting
    for i, feature in enumerate(['Amount', 'Month', 'Fiscal Year']):
        ax = fig.add_subplot(1, 3, i + 1)
        ax.hist(data[feature], bins=25, color='#00A0A0')
        ax.set_title("'%s' Feature Distribution" % (feature), fontsize=14)
        ax.set_xlabel("Value")
        ax.set_ylabel("Number of Records")
        ax.set_ylim((0, 2000))
        ax.set_yticks([0, 500, 1000, 1500, 2000])
        ax.set_yticklabels([0, 500, 1000, 1500, ">2000"])

    # Plot aesthetics
    if transformed:
        fig.suptitle("Log-transformed Distributions of Continuous Payment Data Features",
                     fontsize=16, y=1.03)
    else:
        fig.suptitle("Skewed Distributions of Continuous Payment Data Features",
                     fontsize=16, y=1.03)

    fig.tight_layout()
    fig.show()
#End of distribution visualization function
# Plotting feature importances through this function
def feature_plot(importances, X_train, y_train):
    # Display the five most important features
    indices = np.argsort(importances)[::-1]
    columns = X_train.columns.values[indices[:5]]
    values = importances[indices][:5]

    # Create the plot
    fig = pl.figure(figsize=(9, 5))
    pl.title("Normalized Weights for First Five Most Predictive Features", fontsize=16)
    pl.bar(np.arange(5), values, width=0.6, align="center", color='#00A000',
           label="Feature Weight")
    pl.bar(np.arange(5) - 0.3, np.cumsum(values), width=0.2, align="center", color='#00A0A0',
           label="Cumulative Feature Weight")
    pl.xticks(np.arange(5), columns)
    pl.xlim((-0.5, 4.5))
    pl.ylabel("Weight", fontsize=12)
    pl.xlabel("Feature", fontsize=12)
    pl.legend(loc='upper center')
    pl.tight_layout()
    pl.show()
#End of feature importances function
#Loading the dataset
fname = "C:/DATASETS/data.ct.gov/PaymentsDataset.csv"
openledger = pd.read_csv(fname, low_memory=False, index_col=False)
#Verify the data loaded into memory
print(openledger.head(1))
#Loading into a DataFrame for easier computation
data = pd.DataFrame(openledger)
#Look at the first record
print(data.head(1))
#Check the shape, columns, and dtypes of the dataset
print(data.shape)
print(data.columns)
print(data.dtypes)
#Data cleanup
#Check if there are any columns with empty/null values
print(data.isnull().any())
#No data cleanup needed, so skipping the decision to drop NA rows since they are very few
#data=data.dropna()
print(data.isnull().sum())
#Total number of records
n_records = len(data.index)
#Number of records where payments are above 1.5 times the upper quartile (upper outlier limit)
l = data[data['RedFlag'] == 2].index
n_greater_quantile = len(l)
#Number of records where payments are below 1.5 times the lower quartile (lower outlier limit)
l = data[data['RedFlag'] == 1].index
n_lower_quantile = len(l)
#Percentage of payments above the upper outlier limit
p = float(n_greater_quantile) / n_records * 100.0
greater_percent = p
#Percentage of payments below the lower outlier limit
p = float(n_lower_quantile) / n_records * 100.0
lower_percent = p
# Print the results
print("Total number of records: {}".format(n_records))
print("High value payments above 1.5 times the 75th percentile: {}".format(n_greater_quantile))
print("Low value payments below 1.5 times the 25th percentile: {}".format(n_lower_quantile))
print("Percentage of high value payments: {:.2f}%".format(greater_percent))
print("Percentage of low value payments: {:.2f}%".format(lower_percent))
# PREPARING DATA
# Split the data into features and target label
payment_raw = pd.DataFrame(data['RedFlag'])
features_raw = data.drop('RedFlag', axis=1)
#Removing redundant columns from the features_raw dataset
print(features_raw.dtypes)
redundant_cols = ['TransactionNo', 'Department', 'Account', 'Expense Category',
                  'Vendor ID', 'Payment Method', 'Payment Date', 'Invoice ID',
                  'Invoice Date', 'Unnamed: 0']
features_raw = features_raw.drop(redundant_cols, axis=1)
print(features_raw.dtypes)
# Visualize skewed continuous features of the original data
distribution(data)
# Log-transform the skewed features
#Replacing null values with zero due to a data entry problem in the source
#software: a known issue where the user entry screen accepts nulls with no check
import warnings
warnings.filterwarnings("ignore")
print(features_raw.isnull().sum())
skewed = ['Amount', 'Month', 'Fiscal Year']
features_raw[skewed] = data[skewed].apply(lambda x: np.log(x + 1))
#np.log(x + 1) yields NaN for negative amounts; those are zeroed out below
print(features_raw.isnull().sum())
features_raw.fillna(0, inplace=True)
print(features_raw.dtypes)
# Visualize the new log distributions
distribution(features_raw, transformed=True)
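#np.log1p is the numerically safer built-in equivalent of np.log(x + 1); a
#sketch of the same transform (not applied, to keep the original flow intact):
#features_raw[skewed] = features_raw[skewed].apply(np.log1p)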
#Normalizing numerical features
# Import sklearn.preprocessing.MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
# Initialize a scaler, then apply it to the numerical features
scaler = MinMaxScaler()
numerical = ['Amount', 'Month', 'Fiscal Year']
#features_raw[numerical] = scaler.fit_transform(data[numerical])
features_raw[numerical] = scaler.fit_transform(features_raw[numerical])
distribution(features_raw)
# Look at a record with scaling applied to see if everything is good
print(features_raw.head(n=1))
print(features_raw.columns)
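#MinMaxScaler rescales each column to [0, 1] via (x - min) / (max - min);
#describe() confirms the scaled ranges:
print(features_raw[numerical].describe())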
#Implementation: Data Preprocessing
# Encoding categorical data
print(data.columns)
print(data.dtypes)
print(data['Payment Status'])
#data['Expense Category'] #dropped from features_raw
#data['Payment Method'] #dropped from features_raw
# Encoding the non-numeric 'Payment Status' column to numerical values;
# it is binary, so a simple mapping does the work of one-hot encoding here
d = {"Paid-Reconciled": 0, "Paid-Unreconciled": 1}
features_raw['Payment Status'] = features_raw['Payment Status'].map(d)
# Printing the number of features after encoding
encoded = list(features_raw.columns)
print("{} total features after encoding.".format(len(encoded)))
# Importing train_test_split (sklearn.cross_validation has been retired in
# favour of sklearn.model_selection)
from sklearn.model_selection import train_test_split
print(payment_raw.columns)
# Splitting the features and 'RedFlag' labels into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_raw, payment_raw, test_size=0.2, random_state=0)
print(X_train.columns)
print(X_test.columns)
print(y_train.columns)
print(y_test.columns)
# Showing the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))
#Checking to see if there are any empty rows
print(X_train.isnull().any())
print(y_train.isnull().any())
print(X_train.isnull().sum())
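#If the red-flag classes are imbalanced, a stratified split keeps their ratios
#consistent across both sets; a hedged alternative to the call above:
#X_train, X_test, y_train, y_test = train_test_split(
#    features_raw, payment_raw, test_size=0.2, random_state=0,
#    stratify=payment_raw['RedFlag'])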
#Evaluating model performance
#Establishing a benchmark performance indicator with Naive Bayes
#Naive predictor performance
from sklearn.naive_bayes import GaussianNB
#from sklearn.metrics import accuracy_score, fbeta_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
NB = GaussianNB()
NB.fit(X_train, y_train.values.ravel())  # ravel() flattens the single-column label frame
pred = NB.predict(X_test)
#Calculating accuracy, F1, precision, and recall scores
print(accuracy_score(y_test, pred))
print(f1_score(y_test, pred, average="macro"))
print(precision_score(y_test, pred, average="macro"))
print(recall_score(y_test, pred, average="macro"))
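#classification_report and confusion_matrix are imported above; they give a
#per-class view of the benchmark (rows are true labels, columns predictions):
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))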
# INITIAL MODEL EVALUATION
# Importing the six supervised learning models from sklearn
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
# Initialize the six models
clf_A = GaussianNB()
clf_B = DecisionTreeClassifier(max_features=0.2, max_depth=2, min_samples_split=2, random_state=0)
clf_C = LogisticRegression(random_state=0)
clf_D = SGDClassifier(loss="hinge", penalty="l2")
clf_E = ExtraTreesClassifier(n_estimators=2, max_depth=2, min_samples_split=2, random_state=0)
clf_F = RandomForestClassifier(max_depth=2)
# Collect results on the learners
learners = ["Naive Bayes", "Decision Tree", "Logistic Regression", "SGD Classifier", "ExtraTrees Classifier", "RandomForest Classifier"]
cnt = 0
columns = ['learner', 'train_time', 'pred_time', 'acc_train', 'acc_test', 'f1_score']
learningresults = pd.DataFrame(columns=columns)
results = {}
for learner in [clf_A, clf_B, clf_C, clf_D, clf_E, clf_F]:
    results['learner'] = learners[cnt]
    # Fitting the learner to the training data and timing it
    start = time()  # Get start time
    learner.fit(X_train, y_train.values.ravel())
    end = time()  # Get end time
    results['train_time'] = end - start
    # Calculating the total prediction time
    start = time()  # Get start time
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train)
    end = time()  # Get end time
    results['pred_time'] = end - start
    results['acc_train'] = accuracy_score(y_train, predictions_train)
    results['acc_test'] = accuracy_score(y_test, predictions_test)
    # Score each learner on its own test predictions
    results['f1_score'] = f1_score(y_test, predictions_test, average="macro")
    print(results)
    learningresults.loc[cnt] = results
    cnt = cnt + 1
#Looking at the results to determine the best classifier for our dataset
print(learningresults)
print(learningresults.columns)
learningresults.plot(kind='bar', x='learner', legend='reverse', title='Classifier Algorithms Compared - Accounts Payment Dataset', figsize=(10, 10), fontsize=20)
pl.show()
#learningresults.plot(kind='bar', x='learner').savefig('E:/BUSINESS/APRESS/ApplicationsOfMachineLearning/Chapter15/learner_performance.png')
#-------------------------- MODEL TUNING ------------------------
#Now I will implement model tuning
# Import 'GridSearchCV', 'make_scorer', and any other necessary libraries
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from IPython.display import display
import pickle, os.path
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score

def getscore(y_true, y_predict):
    return fbeta_score(y_true, y_predict, beta=beta)

best_clf = None
beta = 0.5
#Initialize the classifier
clf_C = LogisticRegression(random_state=0)
# Create the parameters list you wish to tune
#parameters = {'n_estimators':range(10,20),'criterion':['gini','entropy'],'max_depth':range(1,5)}
parameters = {'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 'C': range(1, 10), 'max_iter': range(50, 100)}
# Make an fbeta_score scoring object
scorer = make_scorer(getscore)
# Perform grid search on the classifier; the default accuracy scoring is used
# here (pass scoring=scorer to optimize the F-beta score instead)
#grid_obj = GridSearchCV(clf_C, parameters, scoring=scorer)
grid_obj = GridSearchCV(clf_C, parameters)
# Fit the grid search object to the training data and find the optimal parameters
from datetime import datetime
startTime = datetime.now()
grid_fit = grid_obj.fit(X_train, y_train.values.ravel())
# A second, explicitly 5-fold grid search; its results are not used below
CV_lr = GridSearchCV(estimator=clf_C, param_grid=parameters, cv=5)
CV_lr.fit(X_train, y_train.values.ravel())
print(datetime.now() - startTime)
# Get the best estimator
best_clf = grid_fit.best_estimator_
# Make predictions using the unoptimized and optimized models
predictions = (clf_C.fit(X_train, y_train.values.ravel())).predict(X_test)
best_predictions = best_clf.predict(X_test)
# Report the before-and-after scores
print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, predictions, beta=0.5, average='micro')))
print("\nOptimized Model\n------")
print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta=0.5, average='micro')))
# Print the top cross-validation results (cv_results_ is the current
# replacement for the retired grid_scores_ attribute)
df = pd.DataFrame(grid_fit.cv_results_).sort_values('mean_test_score').tail()
display(df)
print("Parameters for the optimal model: {}".format(best_clf.get_params()))
# Now extracting feature importances
# Importing a supervised learning model that has 'feature_importances_'
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import fbeta_score
# Training the supervised model on the training set
model = ExtraTreesClassifier()
model.fit(X_train, y_train.values.ravel())
# Extract the feature importances
importances = model.feature_importances_
# Plot
feature_plot(importances, X_train, y_train)
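#SelectFromModel (imported above) is the threshold-based alternative to the
#manual top-5 slice used in the feature selection step below (a sketch, not run):
#selector = SelectFromModel(model, prefit=True)
#X_train_selected = selector.transform(X_train)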
# Feature selection
# Import functionality for cloning a model
from sklearn.base import clone
# Use the random forest from the comparison step as the final model
# (this replaces the grid-searched estimator chosen above)
best_clf = clf_F
# Reduce the feature space to the five most important features
X_train_reduced = X_train[X_train.columns.values[(np.argsort(importances)[::-1])[:5]]]
X_test_reduced = X_test[X_test.columns.values[(np.argsort(importances)[::-1])[:5]]]
# Train a clone of the final model on the reduced feature set
clf = (clone(best_clf)).fit(X_train_reduced, y_train.values.ravel())
best_predictions = best_clf.predict(X_test)
# Make new predictions on the reduced feature set
reduced_predictions = clf.predict(X_test_reduced)
# Report scores from the final model using both versions of the data
print("Final Model trained on full data\n------")
print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, average="macro", beta=0.5)))
print("\nFinal Model trained on reduced data\n------")
print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, reduced_predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, reduced_predictions, beta=0.5, average='macro')))
# Print the final parameters
print("Parameters for the optimal model: {}".format(clf.get_params()))