# -*- coding: utf-8 -*-
"""
Created on Thu Mar 15 23:46:06 2018
@author: PUNEETMATHUR
"""
#Importing the Python libraries used in this script
import os
import pandas as pd
import matplotlib.pyplot as plt
#Print the current working directory so we know where the CSV is expected
print(os.getcwd())
#Reading dataset from flat file
fname = "Diabetes_Dataset.csv"
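#A minimal defensive check (sketch): fail fast with a clear message
#if the CSV is not in the current working directory
if not os.path.isfile(fname):
    raise FileNotFoundError("Expected %s in %s" % (fname, os.getcwd()))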
#read_csv already returns a DataFrame, so no extra pd.DataFrame() wrapper is needed
df = pd.read_csv(fname, low_memory=False, index_col=False)
#Look at the first record
print(df.head(1))
#Check the size, shape, and columns of the dataset
print(df.size)
print(df.shape)
print(df.columns)
print(df.dtypes)
#Check whether any columns contain empty/null values
print(df.isnull().any())
#Check how many non-null values each column has
df.info()
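#A possible next step (sketch): list the columns with missing values;
#whether to drop or impute depends on the dataset, so the drop is left
#commented out as an illustration only
null_counts = df.isnull().sum()
print(null_counts[null_counts > 0])
#df = df.dropna()  # uncomment to drop rows with any missing value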
#Drop the identifier column; it carries no predictive information
dfworking = df.drop('Patient ID', axis=1)
#describe() gives a quick statistical summary; because we have many columns,
#we also use individual functions for the EDA below
print(dfworking.describe())
#Using individual functions to do EDA
#Checking central tendency: mean, median, and mode (numeric_only skips the text columns)
print(dfworking.mean(numeric_only=True))
print(dfworking.median(numeric_only=True))
print(dfworking.mode())
#How is the data distributed? Check the spread and compute the Tukey
#fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR) used to flag outliers
print(dfworking.std(numeric_only=True))
print(dfworking.max(numeric_only=True))
print(dfworking.min(numeric_only=True))
q1 = dfworking.quantile(0.25, numeric_only=True)
q3 = dfworking.quantile(0.75, numeric_only=True)
iqr = q3 - q1
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr
print(lower_fence)
print(upper_fence)
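#Sketch: count fence-crossing values for every numeric column at once,
#using the fences computed above
numeric_cols = upper_fence.index
outlier_counts = ((dfworking[numeric_cols] > upper_fence) |
                  (dfworking[numeric_cols] < lower_fence)).sum()
print(outlier_counts)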
#How many outliers are in the BPSystolic column?
n_outliers = dfworking['BPSystolic'].loc[dfworking['BPSystolic'] > upper_fence['BPSystolic']].count()
print("BPSystolic values above the upper fence:", n_outliers)
#Visualize the spread with box plots
dfworking.boxplot(figsize=(10, 6))
dfworking.plot.box(vert=False)
dfworking.boxplot(column=['A1cTEST','BPSystolic','BPDiastolic','HeartRateBPM','BMI','Age'],figsize=(10, 6))
plt.show()
#Check the shape of each distribution: kurtosis and skewness
print(dfworking.kurtosis(numeric_only=True))
print(dfworking.skew(numeric_only=True))
import scipy.stats as sp
print(sp.skew(dfworking.A1cTEST))
#Visualizing the dataset: line, histogram, and area plots
dfworking.plot()
dfworking.hist(column=['A1cTEST','BPSystolic','BPDiastolic','HeartRateBPM','BMI','Age'],figsize=(10, 6))
dfworking.plot.area()
dfworking.plot.area(stacked=False)
plt.show()
#Now look at correlations and patterns
print(dfworking.corr(numeric_only=True))
#Scatter plots with marker size scaled by each variable
dfworking.plot.scatter(x='A1cTEST', y='BPSystolic', s=dfworking['A1cTEST']*2)
dfworking.plot.scatter(x='A1cTEST', y='BPSystolic', s=dfworking['BPSystolic']*0.13)
dfworking.plot.hexbin(x='A1cTEST', y='BPSystolic', gridsize=25)
plt.show()
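#Sketch: a quick correlation heat map with plain matplotlib, to see all
#pairwise correlations at a glance
corr = dfworking.corr(numeric_only=True)
plt.figure(figsize=(8, 6))
plt.imshow(corr, cmap='coolwarm', vmin=-1, vmax=1)
plt.colorbar(label='correlation')
plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
plt.yticks(range(len(corr.columns)), corr.columns)
plt.title('Correlation matrix')
plt.show()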
#Look at cross-tabulation to conclude the EDA
#Counting the categorical variables: loop over the columns so that each
#count table is printed, sorted, and plotted
for col in ["Gender", "Type of diabetes", "Diabetes status",
            "FrozenShoulder", "CarpalTunnelSynd", "DuputrensCont"]:
    my_tab = pd.crosstab(index=df[col], columns="Count")  # name the count column
    my_tab = my_tab.sort_values('Count', ascending=False)
    print(my_tab)
    print(my_tab.sum())
    my_tab.transpose().plot(kind='bar', stacked=False, title=col)
plt.show()
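#Sketch: a true two-way cross-tabulation, relating gender to diabetes
#status with row-wise proportions ("Gender" and "Diabetes status" are
#column names taken from the dataset above)
two_way = pd.crosstab(df["Gender"], df["Diabetes status"], normalize="index")
print(two_way)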
#Data Preparation Steps
#Step 1 Split the data into features and the target variable
diabetics = dfworking['Diabetes status']  # target label, kept as a Series
features_raw = dfworking.drop('Diabetes status', axis=1)  # raw (unscaled) features
print(features_raw.columns)
#Step 2 Scale the numerical features to the [0, 1] range
# Import sklearn.preprocessing.MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
# Initialize a scaler, then apply it to the numerical features
scaler = MinMaxScaler()
numerical = ['Age','A1cTEST','BPSystolic','BPDiastolic','HeartRateBPM','BMI']
features_raw[numerical] = scaler.fit_transform(features_raw[numerical])
# Show an example of a record with scaling applied
print(features_raw[numerical].head(n=1))
# Step 3 One-hot encode the 'features_raw' data using pandas.get_dummies()
features = pd.get_dummies(features_raw)
#Checking output
print(features.head(1))
print(diabetics.head(1))
# Print the number of features after one-hot encoding
encoded = list(features.columns)
print("{} total features after one-hot encoding.".format(len(encoded)))
# See the encoded feature names
print(encoded)
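#Sketch: confirm that no text columns remain after encoding, i.e. the
#feature matrix is fully numeric and model-ready
print(features.select_dtypes(include='object').columns)  # expected: empty Index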
#Step 4 Shuffle & Split Final Dataset
# Import train_test_split from sklearn.model_selection
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
# Shuffle the features and labels together so the rows stay aligned
features, diabetics = shuffle(features, diabetics, random_state=0)
# Split the 'features' and 'diabetics' data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, diabetics, test_size=0.2, random_state=0)
# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))
#Model Building & Evaluation
#Creating the models for prediction
#Loading model libraries
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
# prepare models
seed = 7
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
models.append(('RFC', RandomForestClassifier()))
# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
import warnings
warnings.filterwarnings("ignore")
for name, model in models:
    # KFold needs shuffle=True for random_state to take effect in current scikit-learn
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# boxplot comparison of the cross-validation scores for each algorithm
# (matplotlib.pyplot is already imported as plt at the top)
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()
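#Final step (sketch): refit the strongest model on the full training set and
#check it once against the held-out test set. RandomForestClassifier is only
#an illustrative choice here; pick whichever model scored best above.
from sklearn.metrics import accuracy_score, confusion_matrix
best_model = RandomForestClassifier(random_state=seed)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
print("Held-out test accuracy: %.3f" % accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))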