Skip to content

Commit

Permalink
Improved performance and new functionalities
Browse files Browse the repository at this point in the history
Code refactored
Added initial attack prediction for live packet captures
Improved Performance
2D and 3D data plots now available
Fixed code errors
Decreased code redundancy
Moved the dataframes into a global Python class that holds the shared state (design pattern)
  • Loading branch information
mohab-sameh authored May 19, 2021
1 parent 21fab9c commit 3cd674f
Showing 1 changed file with 151 additions and 98 deletions.
249 changes: 151 additions & 98 deletions Implementation/streamlit-demo/form-test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
import pandas as pd
import sklearn




import matplotlib.pyplot as plt
import time
from sklearn import datasets
Expand All @@ -20,11 +23,10 @@
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier


from sklearn.metrics import accuracy_score


import plotly.express as px



Expand All @@ -40,18 +42,19 @@
st.write("hahah omg")
"""
exec(program)
#exec(program)

class Dataframe:
    """Process-wide holder for the dataset and everything derived from it.

    The class is never instantiated: the rest of the script reads and writes
    these attributes directly on the class (``Dataframe.df``,
    ``Dataframe.X``, ...), so it acts as a single shared namespace.
    """

    # Fraction of rows held out for testing. 0.33 is the value the
    # preprocessing sidebar assigns when "Default" is selected.
    ratio = 0.33

    # Filled in once a CSV file has been uploaded.
    df = None  # the full uploaded table
    X = None   # every column except the last
    y = None   # the last column only

    # Filled in when the user presses "Classify".
    X_train = None
    X_test = None
    y_train = None
    y_test = None
    y_pred = None

    # Live packet capture table and the predictions made for it.
    X2_test = None
    y2_pred = None


# NOTE(review): the pre-refactor code kept ``ratio`` at module level; this
# alias is retained in case anything outside this view still reads it.
ratio = Dataframe.ratio

#Defining visualization plot here
def plot_column( col ):
if df[col].dtype == 'object':
if Dataframe.df[col].dtype == 'object':
encoder=LabelEncoder()
df[col] = encoder.fit_transform(df[col])
Dataframe.df[col] = encoder.fit_transform(Dataframe.df[col])
fig, ax = plt.subplots()
df.hist(
Dataframe.df.hist(
bins=8,
column= col ,
grid=False,
Expand All @@ -67,15 +70,43 @@ def plot_column( col ):
#@st.cache(suppress_st_warning=True)
def _write_scatter_plot(df, max_samples=200):
    """Render the 2D/3D scatter-plot controls and the resulting chart.

    Args:
        df: Table to sample rows from; its last column colours the points.
        max_samples: Upper bound for the sample-size slider.
    """
    st.subheader("Scatter Plot:")
    plot_dim = st.selectbox("Select plot dimensionality", ('2D Plot', '3D Plot'))

    # Sampling at least one row from an empty table raises, so stop early.
    if df.empty:
        st.warning('Dataset is empty; nothing to plot.')
        return

    with st.form('Scatter Plot Form'):
        row_cap = min(len(df.index), max_samples)
        num_samples = st.slider(label="Select number of random samples", min_value=1, max_value=row_cap)
        sample_df = df.sample(num_samples)
        label_col = sample_df.columns[-1]

        feature_x = st.selectbox('Select X-Axis Feature', (df.columns))
        feature_y = st.selectbox('Select Y-Axis Feature', (df.columns))

        if plot_dim == '2D Plot':
            if feature_x and feature_y:
                fig = px.scatter(sample_df, x=feature_x, y=feature_y, color=label_col)
                st.plotly_chart(fig)
        elif plot_dim == '3D Plot':
            feature_z = st.selectbox('Select Z-Axis Feature', (df.columns))
            # All three axes must be chosen before a 3D figure can be built.
            if feature_x and feature_y and feature_z:
                fig = px.scatter_3d(sample_df, x=feature_x, y=feature_y, z=feature_z, color=label_col)
                st.plotly_chart(fig)

        # A form must contain its own submit button.
        st.form_submit_button('Apply Selected Options')


def write_statistics(statistics, visualizaitons):
    """Render every statistic the user selected in the sidebar.

    Args:
        statistics: Collection of option labels chosen in the statistics
            multiselect.
        visualizaitons: Column names selected for per-feature plots, or a
            falsy value when none were requested. The misspelled name is
            kept so existing callers are unaffected.
    """
    df = Dataframe.df

    if 'Dataset Shape' in statistics:
        st.write('Shape of Dataset:', df.shape)
    if 'Number of Classes' in statistics:
        st.write('Number of Classes:', len(np.unique(Dataframe.y)))
    if 'Dataset Head' in statistics:
        st.write('Dataset Head:', df.head(5))
    if 'Describe Features' in statistics:
        st.write('Feature Description:', df.describe())
    if 'View Packet Types' in statistics:
        st.write('Packet Types:', np.unique(Dataframe.y))
    if 'Scatter Plots' in statistics:
        _write_scatter_plot(df)

    if visualizaitons:
        write_visualizations(visualizaitons)
Expand All @@ -86,95 +117,52 @@ def write_visualizations(visualizaitons):
plot_column(col=column)


def populate_statistics():
    """Build the "Data Exploration" sidebar and render the chosen items.

    Shows one sidebar form for choosing statistics and, when
    'Plot Feature Visualizations' is among them, a second form for choosing
    which columns of ``Dataframe.df`` to plot. The selections are then passed
    to ``write_statistics``.
    """
    st.sidebar.header('Data Exploration')

    # Print statistics and visualizations sidebar items
    with st.sidebar.form('Statistics Form'):
        statistics = st.multiselect(
            'Select Desired Statistics',
            ('Dataset Head', 'Dataset Shape', 'Number of Classes', 'Describe Features', 'View Packet Types', 'Scatter Plots', 'Plot Feature Visualizations')
        )
        st.form_submit_button('Show Selected Options')

    # Stays falsy unless the user asked for per-feature plots.
    visualizaitons = False
    if 'Plot Feature Visualizations' in statistics:
        with st.sidebar.form('Visualizations Form'):
            visualizaitons = st.multiselect(
                'Select Desired Visualizations',
                (Dataframe.df.columns)
            )
            st.form_submit_button('Show Selected Options')

    if statistics:
        write_statistics(statistics, visualizaitons)


def populate_preprocessors():
    """Build the "Preprocessing" sidebar and apply the selected steps.

    Updates ``Dataframe.df``, ``Dataframe.X``, ``Dataframe.y`` and
    ``Dataframe.ratio`` in place according to the sidebar widgets; nothing is
    returned.
    """
    st.sidebar.header('Preprocessing')

    # Drop null values here:
    if st.sidebar.checkbox('Drop Rows with Null Values'):
        Dataframe.df = Dataframe.df.dropna(axis=0)
        # Features and labels were split off earlier, so drop the same rows
        # from them; otherwise X/y would disagree with df.
        rows_kept = Dataframe.X.index.isin(Dataframe.df.index)
        Dataframe.X = Dataframe.X[rows_kept]
        Dataframe.y = Dataframe.y[rows_kept]

    # Print preprocessing sidebar items
    if st.sidebar.checkbox('Apply Logarithmic Scaling'):
        # NOTE(review): the label says "Logarithmic", but the transform
        # applied is MinMaxScaler; confirm which one is intended.
        sc = MinMaxScaler()
        Dataframe.X[Dataframe.X.columns] = sc.fit_transform(Dataframe.X)

    ratio_btn = st.sidebar.selectbox('Select Custom/Default Test-Train Ratio', ('Default', 'Custom'))
    if ratio_btn == 'Custom':
        Dataframe.ratio = st.sidebar.number_input('ratio', min_value=0.01, max_value=0.99)
    else:
        Dataframe.ratio = 0.33


uploaded_file = st.file_uploader("Choose a file")
if uploaded_file is not None:
#Reading uploading dataset csv file here
df = pd.read_csv(uploaded_file)

#Check if dataframe has NaN/Infinite values
#st.write(df.isna().any())
#st.write(df.isin([np.inf, -np.inf]).any())



#Replace NaN/Infinite values with 0
df = df.fillna(0)
df = df.replace([np.inf, -np.inf], 0)


#Splitting x & y dataframes here
y = df.iloc[:,[-1]]
X = df.iloc[: , :-1]



#Label encoding categorical features here
encoder = LabelEncoder()
num_cols = X._get_numeric_data().columns
cate_cols = list(set(X.columns)-set(num_cols))
for item in cate_cols:
X[item] = encoder.fit_transform(X[item])


#Displaying dataset statistics and visualizaitons here
populate_statistics(df)
df, X, y = populate_preprocessors(df, X, y)



#stats = add_parameter_ui(classifier_name)



#Defining dynamic parameter generation here
Expand Down Expand Up @@ -329,17 +317,6 @@ def add_parameters(clf_name):
return params



#Populating classification sidebar here
classifier_name = st.sidebar.selectbox(
'Select classifier',
('Naive Bayes', 'KNN', 'SVM', 'Random Forest', 'Decision Tree', 'Logistic Regression', 'Gradient Boosting Classifier', 'Artificial Neural Networks')
)
params = add_parameters(classifier_name)




#Instantiating the classifier selected in the sidebar
def get_classifier(clf_name, params):
clf = None
Expand All @@ -350,7 +327,7 @@ def get_classifier(clf_name, params):
if clf_name == 'Naive Bayes':
clf = GaussianNB()
if clf_name == 'Random Forest':
clf = clf = RandomForestClassifier(n_estimators=params['n_estimators'],
clf = RandomForestClassifier(n_estimators=params['n_estimators'],
max_depth=params['max_depth'], random_state=1234, min_samples_split=params['min_samples_split'], n_jobs=params['n_jobs'], criterion=params['criterion'])
if clf_name == 'Decision Tree':
clf = DecisionTreeClassifier(criterion=params['criterion'], splitter=params['splitter'], max_depth = params['max_depth'], min_samples_split=params['min_samples_split'], min_samples_leaf=params['min_samples_leaf'])
Expand All @@ -365,51 +342,127 @@ def get_classifier(clf_name, params):
from keras.layers import LSTM
def lstm():
    """Build and compile the single-layer LSTM model used for the ANN option.

    Returns:
        A compiled Keras ``Sequential`` model.
    """
    model = Sequential()

    # Read the feature count from the LAST axis: that is correct whether
    # X_train is still 2D (rows, features) or has already been reshaped by
    # the caller to 3D (rows, 1, features). Indexing axis 1 would yield 1 in
    # the reshaped case and build a model with the wrong input width.
    n_features = Dataframe.X_train.shape[-1]
    model.add(LSTM(1, batch_input_shape=(None, 1, n_features), return_sequences=False))

    model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['accuracy'])
    return model
clf = KerasClassifier(build_fn=lstm,epochs=1,batch_size=64)
clf = KerasClassifier(build_fn=lstm,epochs=10,batch_size=64)
return clf

clf = get_classifier(classifier_name, params)



#Defining prediction & accuracy metrics function here
def get_prediction():
    """Train the selected classifier and report hold-out metrics.

    Nothing is trained until the 'Classify' button is pressed. On press, the
    data in ``Dataframe.X`` / ``Dataframe.y`` is split, the classifier chosen
    in the sidebar is fitted, and accuracy plus a classification report are
    written to the page. Split results and predictions are stored back on
    ``Dataframe``.
    """
    if not st.button('Classify'):
        st.write('Click the button to classify')
        return

    st.write('Train to Test Ratio = ', Dataframe.ratio)

    # Splitting training and testing dataframes here. The split uses the
    # ratio chosen in the sidebar so the value displayed above is the value
    # actually applied.
    Dataframe.X_train, Dataframe.X_test, Dataframe.y_train, Dataframe.y_test = train_test_split(
        Dataframe.X, Dataframe.y, test_size=Dataframe.ratio, random_state=1234
    )
    clf = get_classifier(classifier_name, params)

    # Reshape dataframes for ANN models: (rows, features) -> (rows, 1, features)
    if classifier_name == 'Artificial Neural Networks':
        Dataframe.X_train = np.reshape(np.ravel(Dataframe.X_train), (Dataframe.X_train.shape[0], 1, Dataframe.X_train.shape[1]))
        Dataframe.X_test = np.reshape(np.ravel(Dataframe.X_test), (Dataframe.X_test.shape[0], 1, Dataframe.X_test.shape[1]))

    # Start classifier fitting and evaluation
    clf.fit(Dataframe.X_train, Dataframe.y_train)
    Dataframe.y_pred = clf.predict(Dataframe.X_test)
    acc = accuracy_score(Dataframe.y_test, Dataframe.y_pred)
    st.write(f'Classifier = {classifier_name}')
    st.write('Accuracy =', acc)
    metrics = sklearn.metrics.classification_report(Dataframe.y_test, Dataframe.y_pred)
    st.text(metrics)


#st.write(y_test.head())
#st.write(y_pred)

encoder = LabelEncoder()
num_cols = y_test._get_numeric_data().columns
cate_cols = list(set(y_test.columns)-set(num_cols))
for item in cate_cols:
y_test[item] = encoder.fit_transform(y_test[item])
def get_live_packet_prediction(path2="custom-train-data.csv"):
    """Train on the uploaded data, then classify a live packet capture file.

    Nothing runs until the 'Classify' button is pressed. On press, the
    classifier chosen in the sidebar is fitted on the columns shared by the
    uploaded dataset and the capture file, hold-out metrics are reported, and
    predictions for the capture rows are displayed with their frequencies.

    Args:
        path2: CSV file holding the live packet capture rows.
    """
    if not st.button('Classify'):
        st.write('Click the button to classify')
        return

    st.write('Train to Test Ratio = ', Dataframe.ratio)

    # Splitting training and testing dataframes here, using the ratio chosen
    # in the sidebar so the value displayed above is the one applied.
    Dataframe.X_train, Dataframe.X_test, Dataframe.y_train, Dataframe.y_test = train_test_split(
        Dataframe.X, Dataframe.y, test_size=Dataframe.ratio, random_state=1234
    )
    clf = get_classifier(classifier_name, params)

    # Import live packet capture data
    Dataframe.X2_test = pd.read_csv(path2)

    # Keep only the columns common to the training data and the capture, in
    # training-column order, and apply the SAME selection to all three frames
    # so the model is fitted and queried on identical feature layouts.
    common_cols = [col for col in Dataframe.X_test.columns if col in Dataframe.X2_test.columns]
    Dataframe.X_test = Dataframe.X_test[common_cols]
    Dataframe.X_train = Dataframe.X_train[common_cols]
    Dataframe.X2_test = Dataframe.X2_test[common_cols].copy()

    # Label encode the imported live packet capture dataframe.
    # NOTE(review): the encoder is fitted on the capture data itself, so the
    # integer codes are not guaranteed to match those used for training.
    live_encoder = LabelEncoder()
    num_cols2 = Dataframe.X2_test._get_numeric_data().columns
    cate_cols2 = [col for col in Dataframe.X2_test.columns if col not in num_cols2]
    for item in cate_cols2:
        Dataframe.X2_test[item] = live_encoder.fit_transform(Dataframe.X2_test[item])

    # Reshape for ANN models: (rows, features) -> (rows, 1, features). This
    # happens after column selection because the reshaped arrays no longer
    # carry column labels.
    live_features = Dataframe.X2_test
    if classifier_name == 'Artificial Neural Networks':
        Dataframe.X_train = np.reshape(np.ravel(Dataframe.X_train), (Dataframe.X_train.shape[0], 1, Dataframe.X_train.shape[1]))
        Dataframe.X_test = np.reshape(np.ravel(Dataframe.X_test), (Dataframe.X_test.shape[0], 1, Dataframe.X_test.shape[1]))
        live_features = np.reshape(np.ravel(live_features), (live_features.shape[0], 1, live_features.shape[1]))

    # Start classifier fitting and evaluation
    clf.fit(Dataframe.X_train, Dataframe.y_train)
    Dataframe.y_pred = clf.predict(Dataframe.X_test)
    acc = accuracy_score(Dataframe.y_test, Dataframe.y_pred)
    st.write(f'Classifier = {classifier_name}')
    st.write('Accuracy =', acc)
    metrics = sklearn.metrics.classification_report(Dataframe.y_test, Dataframe.y_pred)
    st.text(metrics)

    # Predict and show attacks in live packet capture dataframe
    Dataframe.y2_pred = clf.predict(live_features)
    st.write(Dataframe.y2_pred)
    st.bar_chart(Dataframe.y2_pred)
    unique_elements, counts_elements = np.unique(Dataframe.y2_pred, return_counts=True)
    st.write("Frequency of unique values of the said array:")
    st.text(np.asarray((unique_elements, counts_elements)))

# Script entry: everything below the guard needs an uploaded dataset, so it
# only runs once a file has been provided.
uploaded_file = st.file_uploader("Choose a file")
if uploaded_file is not None:
    # Reading uploading dataset csv file here
    Dataframe.df = pd.read_csv(uploaded_file)

    # Replace NaN/Infinite values with 0
    Dataframe.df = Dataframe.df.fillna(0)
    Dataframe.df = Dataframe.df.replace([np.inf, -np.inf], 0)

    # Splitting x & y dataframes here: the last column is the label, the
    # rest are features. X is copied because it is modified below and must
    # not be a view onto df.
    Dataframe.y = Dataframe.df.iloc[:, [-1]]
    Dataframe.X = Dataframe.df.iloc[:, :-1].copy()

    # Label encoding categorical features here (columns visited in their
    # original order).
    encoder = LabelEncoder()
    num_cols = Dataframe.X._get_numeric_data().columns
    cate_cols = [col for col in Dataframe.X.columns if col not in num_cols]
    for item in cate_cols:
        Dataframe.X[item] = encoder.fit_transform(Dataframe.X[item])

    # Displaying dataset statistics and visualizations here
    populate_statistics()
    populate_preprocessors()

    # Populating classification sidebar here
    classifier_name = st.sidebar.selectbox(
        'Select classifier',
        ('Naive Bayes', 'KNN', 'SVM', 'Random Forest', 'Decision Tree', 'Logistic Regression', 'Gradient Boosting Classifier', 'Artificial Neural Networks')
    )
    params = add_parameters(classifier_name)

    get_prediction()

0 comments on commit 3cd674f

Please sign in to comment.