-
Notifications
You must be signed in to change notification settings - Fork 38
/
Copy pathCaseStudy2Chapter4.py
162 lines (124 loc) · 4.16 KB
/
CaseStudy2Chapter4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 17 13:16:52 2018
@author: PMAUTHOR
"""
import pandas as pd
from io import StringIO
import requests
import os
# Query the working directory (the value is neither stored nor printed);
# the CSV below is resolved relative to it.
os.getcwd()

# Load the raw data set into a DataFrame.
fname = "Food_Raw_Data.csv"
hospitals = pd.read_csv(fname, index_col=False, low_memory=False)
df = pd.DataFrame(hospitals)

# First record, total cell count, (rows, columns) and the column labels.
for summary in (df.head(1), df.size, df.shape, df.columns):
    print(summary)

# Per-column dtypes (bare expression: only visible when run interactively).
df.dtypes

# Flag the columns that contain at least one empty/null value.
df.isnull().any()

# Print the non-null count per column to see how many values are missing.
df.info()
# Exploratory data analysis with the individual pandas summary functions.
# The frame also holds non-numeric columns (the script later treats
# 'Feedback' and 'Type of Hospital' as categorical), so every arithmetic
# reduction is restricted to numeric columns explicitly: pandas >= 2.0 raises
# TypeError instead of silently skipping text columns.
# The results are bare expressions, so they are only displayed when the
# script is executed interactively (cell by cell).

# Central tendency: mean, median and mode.
df.mean(numeric_only=True)
df.median(numeric_only=True)
df.mode()

# Spread of the data and first look at possible outliers.
df.std(numeric_only=True)
df.max(numeric_only=True)
df.min(numeric_only=True)
df.quantile(0.25, numeric_only=True) * 1.5
df.quantile(0.75, numeric_only=True) * 1.5

# How many outliers are there in the 'Total Food ordered' column?
df.columns
df.dtypes
# NOTE(review): set_index returns a new frame and the result is not assigned,
# so `df` keeps its original index. Left as-is because later steps rely on
# the current column layout - confirm whether re-indexing was intended.
df.set_index(['Hospital Name'])

# NOTE(review): 238.5 and 679.5 are hard-coded cut-offs, presumably read off
# the scaled quartiles computed above - confirm against the data.
food_ordered = df['Total Food ordered']
food_ordered.loc[food_ordered <= 238.5].count()
food_ordered.loc[food_ordered >= 679.5].count()
# Visualizing the dataset: box plots (vertical, then horizontal) to show the
# spread of each column and any outliers.
df.boxplot(figsize=(10, 6))
df.plot.box(vert=False)

# Shape of the distributions. Restricted to numeric columns explicitly,
# because pandas >= 2.0 raises TypeError on text columns instead of
# skipping them.
df.kurtosis(numeric_only=True)
df.skew(numeric_only=True)

# Cross-check the skewness of a single column with SciPy.
import scipy.stats as sp
sp.skew(df['Total Food ordered'])

# Further views of the data: line plot, histograms, and stacked /
# unstacked area plots.
df.plot()
df.hist(figsize=(10, 6))
df.plot.area()
df.plot.area(stacked=False)
# Correlation and patterns.
# Only numeric/bool columns can be correlated; selecting them explicitly
# behaves the same on every pandas version (pandas >= 2.0 raises on text
# columns instead of dropping them silently).
df.select_dtypes(include=["number", "bool"]).corr()

# Food wasted versus number of guests; in the scatter plot the marker size
# grows with the amount of food wasted.
df.plot.scatter(x='Total Food Wasted', y='No of Guests with Inpatient', s=df['Total Food Wasted']*2)
df.plot.hexbin(x='Total Food Wasted', y='No of Guests with Inpatient', gridsize=25)

# Cross-tabulation to conclude the EDA.
df.columns
df.dtypes

# Counting the categorical variables. The 'Feedback' table gets its own name
# and is printed, so it is no longer overwritten before anyone can see it.
feedback_tab = pd.crosstab(index=df["Feedback"], columns="Count")  # Name the count column
print(feedback_tab)
my_tab = pd.crosstab(index=df["Type of Hospital"], columns="Count")  # Name the count column
print(my_tab)

# Same table, most frequent hospital type first.
my_tab = my_tab.sort_values('Count', ascending=False)
print(my_tab)

# Bar chart of the counts: after transposing, each hospital type becomes its
# own bar series.
data_counts = pd.DataFrame(my_tab)
data_counts.transpose().plot(kind='bar', stacked=False)
# Data Preparation Steps
# Step 1: split the data into features and target variable.
# Target: the amount of food wasted, kept as a one-column DataFrame.
wastage = pd.DataFrame(df['Total Food Wasted'])

# Columns removed from the predictors: the target itself plus three columns
# that are not used as model inputs.
excluded_columns = ['Total Food Wasted', 'Feedback', 'Type of Hospital', 'Total No of beds']
# `dropp` is kept as a frame of the excluded columns so the name stays
# available with the same type as before.
dropp = df[excluded_columns]
# Drop by label list rather than by passing a whole DataFrame as the labels.
features = df.drop(excluded_columns, axis=1)
wastage.columns
features.columns

# Step 2: shuffle and split the final dataset.
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # very old scikit-learn releases (< 0.18)
    from sklearn.cross_validation import train_test_split
from sklearn.utils import shuffle

# Shuffle features and target in ONE call so both receive the same
# permutation and every row stays paired with its own target value.
features, wastage = shuffle(features, wastage, random_state=0)

# Split the 'features' and 'wastage' data into training (80%) and testing
# (20%) sets.
X_train, X_test, y_train, y_test = train_test_split(features, wastage, test_size=0.2, random_state=0)

# Show the results of the split.
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))
# Model Building & Evaluation
# Creating the models for prediction.
# Loading model libraries.
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.svm import LinearSVC

# Creating the linear regression object.
regr = linear_model.LinearRegression()

# NOTE(review): LinearSVC is a classifier, so each distinct target value is
# treated as a class label - confirm this is intended for a wastage amount.
# The target is flattened to 1-D because the classifier expects a vector,
# not a one-column frame.
linear_svm = LinearSVC().fit(X_train, np.ravel(y_train))
regr.fit(X_train, y_train)

# Predictions of both models on the held-out test set.
y_pred = regr.predict(X_test)
yy_pred = linear_svm.predict(X_test)

# Printing coefficients. The SVM coefficients come from the model fitted
# above instead of fitting a second, throw-away LinearSVC.
print('Coefficients: \n', regr.coef_)
print(linear_svm.coef_)

# R^2 on the training data (bare expression: only shown interactively).
regr.score(X_train, y_train)

# Mean squared error on the test set.
print("mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
# Variance score (R^2) on the test set, printed with two decimals like the
# error above.
print("Variance score: %.2f" % r2_score(y_test, y_pred))

# Plot and visualize the linear regression: every feature column of the test
# set is drawn against the predicted values.
plt.plot(X_test, y_pred, linewidth=3)
plt.show()

# Checking graphically the boundaries formed by the linear SVM.
# For each class, the boundary coef[0]*x + coef[1]*y + intercept = 0 is
# solved for y; only the first two coefficients are used, so this is a 2-D
# view of the model.
line = np.linspace(-15, 15)
for coef, intercept in zip(linear_svm.coef_, linear_svm.intercept_):
    plt.plot(line, -(line * coef[0] + intercept) / coef[1])
plt.ylim(-10, 15)
plt.xlim(-10, 8)
plt.show()

# Predict for one new observation. The values are given in the order of the
# training columns and labelled with them, so the input matches what the
# model was fitted on.
new_observation = pd.DataFrame([[820, 81, 363, 35]], columns=X_train.columns)
predicted = regr.predict(new_observation)
print(predicted)
features.head(2)