Improved performance and new functionalities

Code refactored. Initial live packet attack prediction. Improved performance. 2D and 3D data plots now available. Fixed code errors. Decreased code redundancy. Transformed dataframes into a Python global class design pattern.
mohab-sameh · May 19, 2021 · 3cd674f · 3cd674f
1 parent 21fab9c
commit 3cd674f
Showing 1 changed file with 151 additions and 98 deletions.
diff --git a/Implementation/streamlit-demo/form-test.py b/Implementation/streamlit-demo/form-test.py
@@ -5,6 +5,9 @@
 import pandas as pd
 import sklearn
 
+
+
+
 import matplotlib.pyplot as plt
 import time
 from sklearn import datasets
@@ -20,11 +23,10 @@
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.linear_model import LogisticRegression
 from sklearn.ensemble import GradientBoostingClassifier
-
-
 from sklearn.metrics import accuracy_score
 
 
+import plotly.express as px
 
 
 
@@ -40,18 +42,19 @@
 st.write("hahah omg")
 
 """
-exec(program)
+#exec(program)
 
+class Dataframe:
+    ratio=0.3
 
-ratio=0.33
 
 #Defining visualization plot here
 def plot_column( col ):
-    if df[col].dtype == 'object':
+    if Dataframe.df[col].dtype == 'object':
         encoder=LabelEncoder()
-        df[col] = encoder.fit_transform(df[col])
+        Dataframe.df[col] = encoder.fit_transform(Dataframe.df[col])
     fig, ax = plt.subplots()
-    df.hist(
+    Dataframe.df.hist(
         bins=8,
         column= col ,
         grid=False,
@@ -67,15 +70,43 @@ def plot_column( col ):
 #@st.cache(suppress_st_warning=True)
 def write_statistics(statistics, visualizaitons):
     if 'Dataset Shape' in statistics:
-        st.write('Shape of Dataset:', X.shape)
+        st.write('Shape of Dataset:', Dataframe.df.shape)
     if 'Number of Classes' in statistics:
-        st.write('Number of Classes:', len(np.unique(y)))
+        st.write('Number of Classes:', len(np.unique(Dataframe.y)))
     if 'Dataset Head' in statistics:
-        st.write('Dataset Head:', df.head(5))
+        st.write('Dataset Head:', Dataframe.df.head(5))
     if 'Describe Features' in statistics:
-        st.write('Feature Description:', df.describe())
+        st.write('Feature Description:', Dataframe.df.describe())
     if 'View Packet Types' in statistics:
-        st.write('Packet Types:', np.unique(y))
+        st.write('Packet Types:', np.unique(Dataframe.y))
+    if 'Scatter Plots' in statistics:
+        st.subheader("Scatter Plot:")
+        plot_dim = st.selectbox("Select plot dimensionality", ('2D Plot', '3D Plot'))
+        with st.form('Scatter Plot Form'):
+            if(Dataframe.df.shape[0] < 200):
+                max_rows = len(Dataframe.df.index)
+            else:
+                max_rows = 200
+            num_samples = st.slider(label="Select number of random samples", min_value=1, max_value=max_rows)
+            sample_df = Dataframe.df.sample(num_samples)
+
+            if plot_dim == '2D Plot':
+                feature_x = st.selectbox('Select X-Axis Feature', (Dataframe.df.columns))
+                feature_y = st.selectbox('Select Y-Axis Feature', (Dataframe.df.columns))
+                if(feature_x and feature_y):
+                    fig = px.scatter(sample_df, x= feature_x, y = feature_y, color=sample_df.columns[len(sample_df.columns)-1])
+                    st.plotly_chart(fig)
+            if plot_dim == '3D Plot':
+                feature_x = st.selectbox('Select X-Axis Feature', (Dataframe.df.columns))
+                feature_y = st.selectbox('Select Y-Axis Feature', (Dataframe.df.columns))
+                feature_z = st.selectbox('Select Z-Axis Feature', (Dataframe.df.columns))
+                if(feature_x and feature_y):
+                    fig = px.scatter_3d(sample_df, feature_x, feature_y, feature_z, color = sample_df.columns[len(sample_df.columns)-1])
+                    st.plotly_chart(fig)
+
+            scatter_submit = st.form_submit_button('Apply Selected Options')
+
+
 
     if visualizaitons:
         write_visualizations(visualizaitons)
@@ -86,95 +117,52 @@ def write_visualizations(visualizaitons):
         plot_column(col=column)
 
 
-def populate_statistics(df):
+def populate_statistics():
     st.sidebar.header('Data Exploration')
     #Print statistics and visualizations sidebar items
     statistics=False
     visualizaitons=False
     with st.sidebar.form('Statistics Form'):
         statistics = st.multiselect(
             'Select Desired Statistics',
-            ('Dataset Head', 'Dataset Shape', 'Number of Classes', 'Describe Features', 'View Packet Types', 'Plot Feature Visualizations')
+            ('Dataset Head', 'Dataset Shape', 'Number of Classes', 'Describe Features', 'View Packet Types', 'Scatter Plots', 'Plot Feature Visualizations')
         )
         statistics_submit = st.form_submit_button('Show Selected Options')
 
     if 'Plot Feature Visualizations' in statistics:
         with st.sidebar.form('Visualizations Form'):
             visualizaitons = st.multiselect(
                 'Select Desired Visualizations',
-                (df.columns)
+                (Dataframe.df.columns)
             )
             visualizations_submit = st.form_submit_button('Show Selected Options')
 
     if statistics:
         write_statistics(statistics, visualizaitons)
 
 
-def populate_preprocessors(df, X, y):
+def populate_preprocessors():
     st.sidebar.header('Preprocessing')
 
     #Drop null values here:
     drop_nulls_btn = st.sidebar.checkbox('Drop Rows with Null Values')
     if drop_nulls_btn:
-        df = df.dropna(axis=0)
+        Dataframe.df = Dataframe.df.dropna(axis=0)
 
     #Print preprocessing sidebar items
     scaling_btn = st.sidebar.checkbox('Apply Logarithmic Scaling')
     if scaling_btn:
         #Applying logarithmic scaling here
         sc = MinMaxScaler()
-        X = sc.fit_transform(X)
+        Dataframe.X[Dataframe.X.columns] = sc.fit_transform(Dataframe.X)
 
     ratio_btn = st.sidebar.selectbox('Select Custom/Default Test-Train Ratio',('Default', 'Custom'))
-    global ratio
     if ratio_btn == 'Default':
-        ratio = 0.33
+        Dataframe.ratio = 0.33
     if ratio_btn == 'Custom':
-        ratio = st.sidebar.number_input('ratio', min_value=0.01, max_value=0.99)
-
+        Dataframe.ratio = st.sidebar.number_input('ratio', min_value=0.01, max_value=0.99)
 
 
-    return df, X, y
-
-
-uploaded_file = st.file_uploader("Choose a file")
-if uploaded_file is not None:
-    #Reading uploading dataset csv file here
-    df = pd.read_csv(uploaded_file)
-
-    #Check if dataframe has NaN/Infinite values
-    #st.write(df.isna().any())
-    #st.write(df.isin([np.inf, -np.inf]).any())
-
-
-
-    #Replace NaN/Infinite values with 0
-    df = df.fillna(0)
-    df = df.replace([np.inf, -np.inf], 0)
-
-
-    #Splitting x & y dataframes here
-    y = df.iloc[:,[-1]]
-    X = df.iloc[: , :-1]
-
-
-
-    #Label encoding categorical features here
-    encoder = LabelEncoder()
-    num_cols = X._get_numeric_data().columns
-    cate_cols = list(set(X.columns)-set(num_cols))
-    for item in cate_cols:
-        X[item] = encoder.fit_transform(X[item])
-
-
-    #Displaying dataset statistics and visualizaitons here
-    populate_statistics(df)
-    df, X, y = populate_preprocessors(df, X, y)
-
-
-
-#stats = add_parameter_ui(classifier_name)
-
 
 
 #Defining dynamic parameter generation here
@@ -329,17 +317,6 @@ def add_parameters(clf_name):
     return params
 
 
-
-#Populating classification sidebar here
-classifier_name = st.sidebar.selectbox(
-    'Select classifier',
-    ('Naive Bayes', 'KNN', 'SVM', 'Random Forest', 'Decision Tree', 'Logistic Regression', 'Gradient Boosting Classifier', 'Artificial Neural Networks')
-)
-params = add_parameters(classifier_name)
-
-
-
-
 #Instantiating the classifier selected in the sidebar
 def get_classifier(clf_name, params):
     clf = None
@@ -350,7 +327,7 @@ def get_classifier(clf_name, params):
     if clf_name == 'Naive Bayes':
         clf = GaussianNB()
     if clf_name == 'Random Forest':
-        clf = clf = RandomForestClassifier(n_estimators=params['n_estimators'], 
+        clf = RandomForestClassifier(n_estimators=params['n_estimators'], 
             max_depth=params['max_depth'], random_state=1234, min_samples_split=params['min_samples_split'], n_jobs=params['n_jobs'], criterion=params['criterion'])
     if clf_name == 'Decision Tree':
         clf = DecisionTreeClassifier(criterion=params['criterion'], splitter=params['splitter'], max_depth = params['max_depth'], min_samples_split=params['min_samples_split'], min_samples_leaf=params['min_samples_leaf'])
@@ -365,51 +342,127 @@ def get_classifier(clf_name, params):
         from keras.layers import LSTM
         def lstm():
             model = Sequential()
-            model.add(Dense(1,input_dim=41,activation = 'relu',kernel_initializer='random_uniform'))
-            model.add(Dense(1,activation='sigmoid',kernel_initializer='random_uniform'))
-
+            #model.add(Dense(41,input_dim=41,activation = 'relu',kernel_initializer='random_uniform'))
+            #model.add(Dense(41,activation='sigmoid',kernel_initializer='random_uniform'))
+            model.add(LSTM((1),batch_input_shape=(None, 1, Dataframe.X_train.shape[1]), return_sequences=False))
+            #model.add(LSTM(units = 23, return_sequences = True, input_shape= (41, 1, 1)))
             #model.add(LSTM(1, input_shape=(50, 41)))
+
             model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['accuracy'])
             return model
-        clf = KerasClassifier(build_fn=lstm,epochs=1,batch_size=64)
+        clf = KerasClassifier(build_fn=lstm,epochs=10,batch_size=64)
     return clf
 
-clf = get_classifier(classifier_name, params)
-
 
 
 #Defining prediction & accuracy metrics function here
 def get_prediction():
         if st.button('Classify'):
-            st.write('Train to Test Ratio = ', ratio)
+            st.write('Train to Test Ratio = ', Dataframe.ratio)
 
+            #Splitting training and testing dataframes here
+            Dataframe.X_train, Dataframe.X_test, Dataframe.y_train, Dataframe.y_test = train_test_split(Dataframe.X, Dataframe.y, test_size=0.2, random_state=1234)
+            clf = get_classifier(classifier_name, params)
+
+            #Reshape dataframes for ANN models
+            if(classifier_name == 'Artificial Neural Networks'):
+                Dataframe.X_train = np.reshape(np.ravel(Dataframe.X_train), (Dataframe.X_train.shape[0], 1, Dataframe.X_train.shape[1]))
+                Dataframe.X_test = np.reshape(np.ravel(Dataframe.X_test), (Dataframe.X_test.shape[0], 1, Dataframe.X_test.shape[1]))
 
-            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)
-            clf.fit(X_train, y_train)
-            y_pred = clf.predict(X_test)
-            acc = accuracy_score(y_test, y_pred)
+            #Start classifier fitting and evaluation
+            clf.fit(Dataframe.X_train, Dataframe.y_train)
+            Dataframe.y_pred = clf.predict(Dataframe.X_test)
+            acc = accuracy_score(Dataframe.y_test, Dataframe.y_pred)
             st.write(f'Classifier = {classifier_name}')
             st.write('Accuracy =', acc)
-            metrics = sklearn.metrics.classification_report(y_test, y_pred)
+            metrics = sklearn.metrics.classification_report(Dataframe.y_test, Dataframe.y_pred)
             st.text(metrics)
+
+        else: 
+            st.write('Click the button to classify')
 
 
-            #st.write(y_test.head())
-            #st.write(y_pred)
-
-            encoder = LabelEncoder()
-            num_cols = y_test._get_numeric_data().columns
-            cate_cols = list(set(y_test.columns)-set(num_cols))
-            for item in cate_cols:
-                y_test[item] = encoder.fit_transform(y_test[item])
+def get_live_packet_prediction():
+        if st.button('Classify'):
+            st.write('Train to Test Ratio = ', Dataframe.ratio)
+
+            #Splitting training and testing dataframes here
+            Dataframe.X_train, Dataframe.X_test, Dataframe.y_train, Dataframe.y_test = train_test_split(Dataframe.X, Dataframe.y, test_size=0.2, random_state=1234)
+            clf = get_classifier(classifier_name, params)
+
+            #Reshape dataframes for ANN models
+            if(classifier_name == 'Artificial Neural Networks'):
+                Dataframe.X_train = np.reshape(np.ravel(Dataframe.X_train), (Dataframe.X_train.shape[0], 1, Dataframe.X_train.shape[1]))
+                Dataframe.X_test = np.reshape(np.ravel(Dataframe.X_test), (Dataframe.X_test.shape[0], 1, Dataframe.X_test.shape[1]))
+
+
+            #Import live packet capture data
+            path2 = "custom-train-data.csv"
+            Dataframe.X2_test = pd.read_csv(path2)
+            #Keep only common columns between training data and live packet capture dataframes
+            common_cols = [col for col in set(Dataframe.X_test.columns).intersection(Dataframe.X2_test.columns)]
+            Dataframe.X_test = Dataframe.X_test[common_cols]
+            Dataframe.X_train = Dataframe.X_train[common_cols]
+            #Label encode the imported live packet capture dataframe
+            num_cols2 = Dataframe.X2_test._get_numeric_data().columns
+            cate_cols2 = list(set(Dataframe.X2_test.columns)-set(num_cols2))
+            for item in cate_cols2:
+                Dataframe.X2_test[item] = encoder.fit_transform(Dataframe.X2_test[item])
+
+
+            #Start classifier fitting and evaluation
+            clf.fit(Dataframe.X_train, Dataframe.y_train)
+            Dataframe.y_pred = clf.predict(Dataframe.X_test)
+            acc = accuracy_score(Dataframe.y_test, Dataframe.y_pred)
+            st.write(f'Classifier = {classifier_name}')
+            st.write('Accuracy =', acc)
+            metrics = sklearn.metrics.classification_report(Dataframe.y_test, Dataframe.y_pred)
+            st.text(metrics)
 
-            y_pred = encoder.fit_transform(y_pred)
-
 
-            st.write(sklearn.metrics.roc_auc_score(y_test, y_pred, multi_class='ovr',axis=0))
+            #Predict and show attacks in live packet capture dataframe
+            Dataframe.y2_pred = clf.predict(Dataframe.X2_test)
+            st.write(Dataframe.y2_pred)
+            st.bar_chart(Dataframe.y2_pred)
+            unique_elements, counts_elements = np.unique(Dataframe.y2_pred, return_counts=True)
+            st.write("Frequency of unique values of the said array:")
+            st.text(np.asarray((unique_elements, counts_elements)))
 
         else: 
             st.write('Click the button to classify')
 
+uploaded_file = st.file_uploader("Choose a file")
+if uploaded_file is not None:
+    #Reading uploading dataset csv file here
+    Dataframe.df = pd.read_csv(uploaded_file)
+
+    #Replace NaN/Infinite values with 0
+    Dataframe.df = Dataframe.df.fillna(0)
+    Dataframe.df = Dataframe.df.replace([np.inf, -np.inf], 0)
+
+    #Splitting x & y dataframes here
+    Dataframe.y = Dataframe.df.iloc[:,[-1]]
+    Dataframe.X = Dataframe.df.iloc[: , :-1]
+
+    #Label encoding categorical features here
+    encoder = LabelEncoder()
+    num_cols = Dataframe.X._get_numeric_data().columns
+    cate_cols = list(set(Dataframe.X.columns)-set(num_cols))
+    for item in cate_cols:
+        Dataframe.X[item] = encoder.fit_transform(Dataframe.X[item])
+
+
+    #Displaying dataset statistics and visualizaitons here
+    populate_statistics()
+    populate_preprocessors()
+
+
+#Populating classification sidebar here
+classifier_name = st.sidebar.selectbox(
+    'Select classifier',
+    ('Naive Bayes', 'KNN', 'SVM', 'Random Forest', 'Decision Tree', 'Logistic Regression', 'Gradient Boosting Classifier', 'Artificial Neural Networks')
+)
+params = add_parameters(classifier_name)
+
 get_prediction()