getSentiment.py

import tensorflow as tf
import numpy as np
import os
import time
import datetime
import data_helpers
import math

import re
import itertools
from collections import Counter

import random
import nltk
import collections
import word2vec
import sys


def cleanForView(string):
    stringA = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    return stringA.strip().lower()
    
def getSentimentCNN(fileToLoad, modelDir):
    checkpoint_dir = "./rnn_runs/"+modelDir+"/checkpoints/"
    batch_size = 64
    x_test, y_test, vocabulary, vocabulary_inv,trainS = data_helpers.load_data_for_books("./data/"+fileToLoad+".txt")
    y_test = np.argmax(y_test, axis=1)
    print("Vocabulary size: {:d}".format(len(vocabulary)))
    print("Test set size {:d}".format(len(y_test)))
    
    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
          allow_soft_placement=True,
          log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name("output/predictions").outputs[0]
            scores = graph.get_operation_by_name("output/scores").outputs[0]
            # Generate batches for one epoch
            batches = data_helpers.batch_iter(x_test, batch_size, 1, shuffle=False)

            # Collect the predictions here
            all_predictions = []
            all_scores = []
            for x_test_batch in batches:
                batch_scores = sess.run(scores, {input_x: x_test_batch, dropout_keep_prob: 1.0})
                batch_predictions = np.argmax(batch_scores,axis=1)
                #batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0})
                all_predictions = np.concatenate([all_predictions, batch_predictions])
                all_scores = np.concatenate([all_scores,batch_scores[:,1] - batch_scores[:,0]])
                
    mbs = float(len(all_predictions[all_predictions == 1]))/len(all_predictions)
    mss = np.mean(all_scores)
    print "Mean Binary Sentiment",mbs
    print "Mean Smooth Sentiment",mss
    return all_predictions,all_scores
    
def getSentimentRNN(fileToLoad,modelDir):
    checkpoint_dir = "./rnn_runs/"+modelDir+"/checkpoints/"
    batch_size = 64
    n_hidden = 256
    
    x_test, y_test, vocabulary, vocabulary_inv,trainS = data_helpers.load_data_for_books("./data/"+fileToLoad+".txt")
    y_test = np.argmax(y_test, axis=1)
    print("Vocabulary size: {:d}".format(len(vocabulary)))
    print("Test set size {:d}".format(len(y_test)))
    x_test = np.fliplr(x_test)
    
    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
          allow_soft_placement=True,
          log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            print("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("x_input").outputs[0]
            predictions = graph.get_operation_by_name("prediction").outputs[0]
            istate = graph.get_operation_by_name('initial_state').outputs[0]
            keep_prob = graph.get_operation_by_name('keep_prob').outputs[0]
            # Generate batches for one epoch
            batches = data_helpers.batch_iter(x_test, batch_size, 1, shuffle=False)

            # Collect the predictions here
            all_predictions = []
            all_scores = []
            for x_test_batch in batches:
                batch_predictions = sess.run(predictions, {input_x: x_test_batch, istate: np.zeros((len(x_test_batch), 2*n_hidden)), keep_prob: 1.0})
                binaryPred = np.argmax(batch_predictions,axis=1)
                all_predictions = np.concatenate([all_predictions, binaryPred])
                all_scores = np.concatenate([all_scores, batch_predictions[:,1] - batch_predictions[:,0]])
                
        mbs = float(len(all_predictions[all_predictions == 1]))/len(all_predictions)
        mss = np.mean(all_scores)
        print "Mean Binary Sentiment",mbs
        print "Mean Smooth Sentiment",mss
        return all_predictions,all_scores
        

def saveSentiment(fileToSave,all_predictions,all_scores):
    text = ''.join(open("./data/"+fileToSave+".txt").readlines()).decode('utf8')

    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    book = tokenizer.tokenize(text)
    book = [cleanForView(sent) for sent in book]
    toOut = zip(book,all_predictions,all_scores)
    
    import unicodecsv as csv
    myfile = open(fileToSave+'.csv', 'wb')
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(["Text","Binary_Sentiment","Cont_Sentiment"])
    for row in toOut:
        wr.writerow(row)
    print "Saved",fileToSave+'.csv'
    
arguments = sys.argv
book = arguments[1]
nntype = arguments[2]
modelDir = arguments[3]

if nntype == "CNN":
    all_predictions,all_scores = getSentimentCNN(book,modelDir)
    saveSentiment(book,all_predictions,all_scores)
elif nntype == "RNN":
    all_predictions,all_scores = getSentimentRNN(book,modelDir)
    saveSentiment(book,all_predictions,all_scores)
else:
    print "Please choose a neural network type: CNN or RNN"