# FFNN_LSTM.py

# -*- coding: utf-8 -*-
"""Copy of Deep Learning 1.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1qMUzD8evPxRW9bEY0DGsFXBbgNphY5ho
"""

import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
import math
import time
import numpy as np
import sys
import argparse
import os
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import tensorflow as tf
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import pandas as pd

def readFile(file):
    """Open *file* for reading in text mode and return the open handle.

    The handle is returned still open so the caller can read from it; the
    caller is therefore responsible for closing it (the returned object can
    be used in a ``with`` statement).  The previous ``f.close()`` placed
    after the ``return`` could never run and has been removed.

    Args:
        file: Path of the file to open.

    Returns:
        The open text-mode file object.
    """
    return open(file, "r")

def _collect_samples(text):
    """Split raw corpus text into per-sample word lists and lower-cased labels."""
    stops = set(stopwords.words('english'))

    # Only tokens belonging to a biography closed by <end_bio> are kept;
    # anything after the last <end_bio> is discarded.
    closed_tokens = []
    pending = []
    for token in text.split():
        if token == "<end_bio>":
            closed_tokens.extend(pending)
            pending = []
        else:
            pending.append(token)

    samples = []
    labels = []
    current = []
    for token in closed_tokens:
        if token == "[FAKE]" or token == "[REAL]":
            # A label token closes the sample built so far.
            samples.append(current)
            current = []
            labels.append(token.lower())
        elif token == "<start_bio>" or token in stops:
            continue
        else:
            cleaned = re.sub(r"[^0-9a-zA-Z \t]", "", token)
            if cleaned:
                current.append(cleaned.lower())
    return samples, labels


def preprocessing(readFile, train, max_tokens=256):
    """Turn labelled biography text into a padded token matrix and 0/1 labels.

    The text is split on whitespace.  ``<start_bio>`` markers and English
    stop words are skipped, remaining tokens are stripped of every character
    other than ASCII letters, digits, space and tab, then lower-cased.  Each
    ``[REAL]`` / ``[FAKE]`` token ends a sample; its label is 1 for
    ``[REAL]`` and 0 for ``[FAKE]``.

    Every sample is padded with ``" "`` (or truncated) to one common length
    and the matrix is finally cut to at most ``max_tokens`` columns.

    Args:
        readFile: The complete corpus as one string.
        train: ``1`` selects training mode, where the padding length is the
            length of the longest sample.  Any other value is used directly
            as the padding length.  NOTE: a padding length of exactly 1
            cannot be told apart from training mode.
        max_tokens: Maximum number of columns kept in the token matrix.

    Returns:
        ``(tokens, y, padding_length)`` in training mode, otherwise
        ``(tokens, y)``.  ``tokens`` is a 2-D object array of strings and
        ``y`` is an integer column vector.

    Raises:
        ValueError: If the text contains no labelled sample.
    """
    samples, labels = _collect_samples(readFile)
    if not samples:
        raise ValueError("no labelled biographies found in the input text")

    if train == 1:
        lenPadding = max(len(sample) for sample in samples)
    else:
        lenPadding = train

    # Slicing truncates long samples; the multiplier is <= 0 for them, which
    # yields no padding.  (The old code called len() on the loop index here
    # and raised TypeError for any over-long validation/test sample.)
    padded = [
        sample[:lenPadding] + [" "] * (lenPadding - len(sample))
        for sample in samples
    ]

    wordsFiltered = np.array(padded, dtype=object)[:, :max_tokens]
    y = (np.array(labels, dtype=object) == '[real]').astype("int")
    y = y.reshape((-1, 1))

    if train == 1:
        return (wordsFiltered, y, lenPadding)
    return (wordsFiltered, y)

def real_fake(wordsFiltered, train):
    """Separate a token matrix into feature columns and a float label column.

    The last column of ``wordsFiltered`` is compared with ``'[real]'`` to
    produce the labels (1.0 on a match, 0.0 otherwise); the first ``train``
    columns are returned as the features.

    Args:
        wordsFiltered: 2-D array whose last column holds the label strings.
        train: Number of leading columns to keep as features.

    Returns:
        ``(x, y)`` where ``y`` has shape ``(-1, 1)``.
    """
    is_real = wordsFiltered[:, -1] == '[real]'
    targets = is_real.astype("float").reshape((-1, 1))
    features = wordsFiltered[:, :train]
    return (features, targets)

def dictionary_int(x):
    """Build a vocabulary mapping each token to ``[id, count]``.

    Ids start at 1 and are assigned in order of first appearance in
    ``x.flatten()``.  The returned dict is ordered by descending count;
    tokens with equal counts appear in reverse order of first appearance.

    The previous implementation looked the token up with ``id - 2`` instead
    of ``id - 1``, so every token was paired with the id and count of the
    token seen after it.  Each token now keeps its own id and count.

    Args:
        x: Array of tokens (anything providing ``flatten()``).

    Returns:
        dict mapping token -> ``[id, count]``.
    """
    vocab = {}
    for token in x.flatten():
        if token in vocab:
            vocab[token][1] += 1
        else:
            vocab[token] = [len(vocab) + 1, 1]

    # Stable ascending sort followed by a reverse keeps the original
    # ordering of ties (latest first appearance first).
    ordered = sorted(vocab.items(), key=lambda item: item[1][1])
    ordered.reverse()
    return dict(ordered)

def dfr(x):
    """Return the integer id of token *x* from the module-level vocabulary ``d``.

    ``d`` is expected to map token -> ``[id, count]``.  Tokens missing from
    the vocabulary map to 0.

    Only ``KeyError`` is treated as "unknown token"; other failures (for
    example ``d`` not being defined yet) are no longer silently turned into
    id 0.

    Args:
        x: Token to look up.

    Returns:
        The token id, or 0 when the token is not in ``d``.
    """
    try:
        return d[x][0]
    except KeyError:
        return 0

def train(path="mix.train.txt"):
    """Load the training corpus and build the vocabulary from it.

    Args:
        path: Location of the training file.

    Returns:
        ``(y, d, lenpadding, x)``: the label column, the vocabulary produced
        by ``dictionary_int``, the padding length reported by
        ``preprocessing`` and the padded token matrix.
    """
    # The handle returned by readFile is closed as soon as the text is read.
    with readFile(path) as handle:
        text = handle.read()
    x, y, lenpadding = preprocessing(text, 1)
    d = dictionary_int(x)
    return (y, d, lenpadding, x)

def valid(path="mix.valid.txt"):
    """Load the validation corpus and encode its tokens as integer ids.

    Reads the module-level ``lenpadding`` for the padding length and encodes
    tokens through ``dfr``, so both must be set up before this is called.

    Args:
        path: Location of the validation file.

    Returns:
        ``(X, y, x)``: the id matrix, the label column and the padded token
        matrix the ids were derived from.
    """
    # The handle returned by readFile is closed as soon as the text is read.
    with readFile(path) as handle:
        text = handle.read()
    x, y = preprocessing(text, lenpadding)
    X = np.vectorize(dfr)(x)
    return (X, y, x)

def test(path="mix.test.txt"):
    """Load the test corpus and encode its tokens as integer ids.

    Reads the module-level ``lenpadding`` for the padding length and encodes
    tokens through ``dfr``, so both must be set up before this is called.

    Args:
        path: Location of the test file.

    Returns:
        ``(X, y)``: the id matrix and the label column.
    """
    # The handle returned by readFile is closed as soon as the text is read.
    with readFile(path) as handle:
        text = handle.read()
    x, y = preprocessing(text, lenpadding)
    X = np.vectorize(dfr)(x)
    return (X, y)

def LSTM_Model(X,d):
    """Build and compile the two-layer LSTM binary classifier.

    The embedding's sequence length comes from ``X.shape[1]`` and its
    vocabulary size is ``len(d) + 1``.

    Args:
        X: Encoded input matrix; only its second dimension is read.
        d: Vocabulary; only its length is read.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    layers = tf.keras.layers
    network = tf.keras.models.Sequential([
        layers.Embedding(input_length=X.shape[1], input_dim=len(d)+1, output_dim=128),
        layers.LSTM(units=50, return_sequences=True, kernel_initializer='glorot_uniform'),
        layers.LSTM(units=50, kernel_initializer='glorot_uniform'),
        layers.Dense(units=64, activation="relu"),
        layers.Dense(units=1, activation="sigmoid"),
    ])
    network.compile(
        optimizer=tf.keras.optimizers.Adam(8e-5),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=[tf.keras.metrics.BinaryAccuracy()],
    )
    return network

def FFNN_Model(X,d):
    """Build and compile the feed-forward binary classifier.

    Layer stack: embedding, three dense ReLU layers (128, 64 and 1 units),
    a flatten, and a single sigmoid output unit.  The embedding's sequence
    length comes from ``X.shape[1]`` and its vocabulary size is
    ``len(d) + 1``.

    Args:
        X: Encoded input matrix; only its second dimension is read.
        d: Vocabulary; only its length is read.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    layers = tf.keras.layers
    network = tf.keras.models.Sequential([
        layers.Embedding(input_length=X.shape[1], input_dim=len(d)+1, output_dim=128),
        layers.Dense(units=128, activation="relu"),
        layers.Dense(units=64, activation="relu"),
        layers.Dense(units=1, activation="relu"),
        layers.Flatten(),
        layers.Dense(units=1, activation="sigmoid"),
    ])
    network.compile(
        optimizer=tf.keras.optimizers.Adam(8e-5),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=[tf.keras.metrics.BinaryAccuracy()],
    )
    return network

def Model_Summary(model):
  """Call ``model.summary()``; nothing is returned."""
  model.summary()

def LSTM_Training(model,X,y,Xv,yv, batch_size=32, epochs=4):
    """Fit *model* on the training data, validating on ``(Xv, yv)``.

    Batch size and epoch count used to be hard-coded; they are now
    parameters whose defaults reproduce the previous behaviour.

    Args:
        model: Object exposing a Keras-style ``fit`` method.
        X: Training inputs.
        y: Training labels.
        Xv: Validation inputs.
        yv: Validation labels.
        batch_size: Samples per gradient update.
        epochs: Number of passes over the training data.

    Returns:
        Whatever ``model.fit`` returns.
    """
    return model.fit(
        X,
        y,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(Xv, yv),
        shuffle=True,
    )

def FFNN_Training(model,X,y,Xv,yv, batch_size=32, epochs=4):
    """Fit *model* on the training data, validating on ``(Xv, yv)``.

    Batch size and epoch count used to be hard-coded; they are now
    parameters whose defaults reproduce the previous behaviour.

    Args:
        model: Object exposing a Keras-style ``fit`` method.
        X: Training inputs.
        y: Training labels.
        Xv: Validation inputs.
        yv: Validation labels.
        batch_size: Samples per gradient update.
        epochs: Number of passes over the training data.

    Returns:
        Whatever ``model.fit`` returns.
    """
    return model.fit(
        X,
        y,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(Xv, yv),
        shuffle=True,
    )

def Model_Predict_LSTM(model, Xt, yt):
    """Predict on ``Xt``, round the outputs and score them against ``yt``.

    The raw predictions are printed before rounding.

    Args:
        model: Object exposing ``predict``.
        Xt: Inputs to predict on.
        yt: Reference labels.

    Returns:
        ``(accuracy, rounded_predictions)``.
    """
    raw_scores = model.predict(Xt)
    print("yp",raw_scores)
    rounded = raw_scores.round()
    return (accuracy_score(yt, rounded), rounded)

def Model_Predict_FFNN(model, Xt, yt):
    """Predict on ``Xt``, round the outputs and score them against ``yt``.

    The raw predictions are printed before rounding.

    Args:
        model: Object exposing ``predict``.
        Xt: Inputs to predict on.
        yt: Reference labels.

    Returns:
        ``(accuracy, rounded_predictions)``.
    """
    raw_scores = model.predict(Xt)
    print("yp",raw_scores)
    rounded = raw_scores.round()
    return (accuracy_score(yt, rounded), rounded)

def History(history):
    """Return the ``history`` attribute of a training-history object.

    The previous body evaluated ``history.history`` and discarded it, so the
    function always returned ``None``.

    Args:
        history: Object exposing a ``history`` attribute.

    Returns:
        The value of ``history.history``.
    """
    return history.history

def Model_Learning_Curves(history):
    """Plot the ``loss`` and ``val_loss`` series of a training history.

    Args:
        history: Object whose ``history`` attribute maps ``'loss'`` and
            ``'val_loss'`` to per-epoch values.
    """
    for series in ('loss', 'val_loss'):
        plt.plot(history.history[series])
    plt.title('Model Learning Curves')
    plt.ylabel('BinaryCrossentropy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper right')
    plt.show()

def ConfusionMatrix(yt, yp):
    """Draw the real/fake confusion matrix as an annotated heatmap.

    Labels in this file are encoded as 1 for ``[real]`` and 0 for ``[fake]``
    (see ``preprocessing``).  ``confusion_matrix`` orders classes ascending
    by default, which put the *fake* class in the row and column that were
    labelled "Real".  The class order is now passed explicitly so the matrix
    matches the axis labels; this also keeps the matrix 2x2 when one class is
    absent from the data.

    Args:
        yt: True labels (1 = real, 0 = fake).
        yp: Predicted labels (1 = real, 0 = fake).
    """
    class_order = [1, 0]  # real first, then fake, matching the axis labels
    cm = confusion_matrix(yt, yp, labels=class_order)
    df_cm = pd.DataFrame(
        cm,
        columns=['Predicted Real', 'Predicted Fake'],
        index=['Actual Real', 'Actual Fake'],
    )
    plt.figure(figsize = (6,4))
    plt.title('Fake Detection Confusion Matrix')
    sns.heatmap(df_cm, annot=True, cmap='Blues', fmt='g')
    plt.show()

def LSTM_Flow(X,y,Xv,yv,Xt,yt,d):
    """Run the LSTM pipeline: build, summarise, train, evaluate and plot.

    Args:
        X, y: Training inputs and labels.
        Xv, yv: Validation inputs and labels.
        Xt, yt: Test inputs and labels.
        d: Vocabulary passed to the model builder.
    """
    network = LSTM_Model(X, d)
    Model_Summary(network)
    fit_log = LSTM_Training(network, X, y, Xv, yv)
    score, predictions = Model_Predict_LSTM(network, Xt, yt)
    print("LSTM Accuracy :",score)
    History(fit_log)
    Model_Learning_Curves(fit_log)
    ConfusionMatrix(yt, predictions)

def FFNN_Flow(X,y,Xv,yv,Xt,yt,d):
    """Run the feed-forward pipeline: build, summarise, train, evaluate and plot.

    Args:
        X, y: Training inputs and labels.
        Xv, yv: Validation inputs and labels.
        Xt, yt: Test inputs and labels.
        d: Vocabulary passed to the model builder.
    """
    network = FFNN_Model(X, d)
    Model_Summary(network)
    fit_log = FFNN_Training(network, X, y, Xv, yv)
    score, predictions = Model_Predict_FFNN(network, Xt, yt)
    print("FFNN Accuracy :",score)
    History(fit_log)
    Model_Learning_Curves(fit_log)
    ConfusionMatrix(yt, predictions)

if __name__ == "__main__":
  # Load the training data first: this binds the module-level names `d` and
  # `lenpadding`, which dfr() and valid()/test() read as globals.
  y,d,lenpadding,x = train()

  # Encode every training token as its integer id via dfr.
  applyall = np.vectorize(dfr)
  X  = applyall(x)

  # Validation and test sets are padded with `lenpadding` and encoded with `d`.
  Xv,yv,xv = valid()
  Xt,yt = test()
  
  # Train and evaluate the feed-forward model, then the LSTM model.
  FFNN_Flow(X,y,Xv,yv,Xt,yt,d)

  LSTM_Flow(X,y,Xv,yv,Xt,yt,d)