# datam_and_tfidf.py

# -*- coding: utf-8 -*-
"""DataM and TFIDF.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1_0ISRF6dv6WyKR5tsVV86fCtBJwP8b-d

# IMPORTS AND VISUALIZATION
"""

from google.colab import drive
drive.mount('/content/drive')

# Commented out IPython magic to ensure Python compatibility.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
#from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
#from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
# %matplotlib inline
from nltk.tokenize import word_tokenize
from __future__ import unicode_literals
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import nltk
nltk.download("wordnet")
!pip install hazm
from hazm import *


# Keras
from keras import optimizers
from keras.models import Model, Sequential
from keras.layers import Dense, Input, Embedding, Dropout
from keras.layers import GlobalMaxPool1D, MaxPooling1D, GlobalMaxPooling1D
from keras.layers import CuDNNLSTM, LSTM, Bidirectional
from keras.layers.convolutional import Conv1D
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

"""**Read data**"""

import pandas as pd

# Load the labelled comments from the uploaded Excel sheet.
df = pd.read_excel("/content/sample_data/dataset for clustering.xlsx")
df

# NOTE: rows with a missing "Comment" are not dropped at this point.

# Class balance: one bar per distinct value of the "Label" column.
# The column is passed by keyword because recent seaborn releases accept data
# variables only as keyword arguments (a positional argument is read as `data`).
sns.countplot(x=df["Label"])
plt.xlabel('Label')
plt.title('Number of label ')

df.head()

"""# NORMALIZATION"""

import nltk
nltk.download('omw-1.4')
import re
from nltk.corpus import wordnet
# Remove repeated characters (e.g. elongated words).
class RepeatReplacer(object):
    """Collapse doubled characters in a string, one duplicate per pass.

    A string that WordNet already knows (``wordnet.synsets`` returns a
    non-empty result) is returned untouched; otherwise doubled characters are
    removed until either WordNet recognises the result or nothing changes.
    """

    def __init__(self):
        # Group 2 followed by a back-reference to itself matches a doubled
        # word character; groups 1 and 3 keep the surrounding word characters.
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        # Re-emit the match with only one copy of the doubled character.
        self.repl = r'\1\2\3'

    def replace(self, word):
        """Return *word* with repeated characters collapsed.

        Args:
            word: the string to process.

        Returns:
            *word* unchanged if WordNet has synsets for it, otherwise the
            string obtained by repeatedly removing doubled characters.
        """
        # Iterative on purpose: the recursive form needed one stack frame per
        # removed duplicate, so a very elongated input could exhaust the
        # recursion limit.
        while not wordnet.synsets(word):
            collapsed = self.repeat_regexp.sub(self.repl, word)
            if collapsed == word:
                break  # nothing left to collapse
            word = collapsed
        return word


replacer = RepeatReplacer()


# Characters removed by clean(). They are compiled once into a translation
# table so each call is a single pass instead of one str.replace per character.
_CLEAN_DIGITS = (
    "0123456789"
    "\u06f0\u06f1\u06f2\u06f3\u06f4\u06f5\u06f6\u06f7\u06f8\u06f9"  # Persian digits
    "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669"  # Arabic-Indic digits
)
_CLEAN_LETTERS = "abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
_CLEAN_PUNCT = (
    "\u061f\u060c"      # Arabic question mark, Arabic comma
    "\"'=@&%.,:\\$^"
    "<>!?{};\n\t"
    "()[]/*+#"
    "\u200c\ufeff"      # zero-width non-joiner, byte-order mark
    "-_|"
)
_CLEAN_TABLE = str.maketrans("", "", _CLEAN_DIGITS + _CLEAN_LETTERS + _CLEAN_PUNCT)


def clean(text):
    """Strip digits, English letters and unwanted punctuation from *text*.

    Removed characters are deleted outright (not replaced by a space), so the
    characters on either side of a removed one become adjacent.

    Compared with the earlier list-based version:
      * the Persian digit three (U+06F3) is now removed; the old list held the
        Arabic-Indic three (U+0663) in its place,
      * upper-case English letters are removed as well as lower-case ones,
      * the literal entry "u200c" is gone; it could never match because the
        digits and lower-case letters had already been deleted.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    return text.translate(_CLEAN_TABLE)


# Normalise sentences with hazm: fix half-spaces and reduce words (e.g. plural
# forms) to their base form.
_HAZM_TOOLS = {}


def normalizeWhazm(text):
    """Normalise *text* with hazm, then lemmatise and stem it word by word.

    The hazm ``Normalizer``, ``Lemmatizer`` and ``Stemmer`` objects are created
    on the first call and reused afterwards instead of being rebuilt for every
    comment.

    ``Lemmatizer.lemmatize`` and ``Stemmer.stem`` are documented by hazm as
    taking a single word, so they are applied to each whitespace-separated
    token rather than to the whole sentence at once.

    Args:
        text: the sentence to normalise.

    Returns:
        The processed tokens joined by single spaces.
    """
    if not _HAZM_TOOLS:
        _HAZM_TOOLS["normalizer"] = Normalizer()
        _HAZM_TOOLS["lemmatizer"] = Lemmatizer()
        _HAZM_TOOLS["stemmer"] = Stemmer()
    normalizer = _HAZM_TOOLS["normalizer"]
    lemmatizer = _HAZM_TOOLS["lemmatizer"]
    stemmer = _HAZM_TOOLS["stemmer"]

    text = normalizer.normalize(text)
    # Same order as before: lemmatise first, then stem the result.
    tokens = [stemmer.stem(lemmatizer.lemmatize(token)) for token in text.split()]
    return " ".join(tokens)


# Code-point ranges deleted by remove_emoji(), written as regex character-class
# members.
_EMOJI_CLASS_MEMBERS = (
    u"\U0001F600-\U0001F64F",  # emoticons
    u"\U0001F300-\U0001F5FF",  # symbols & pictographs
    u"\U0001F680-\U0001F6FF",  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
    u"\U00002702-\U000027B0",
    # This range covers every code point from U+24C2 up to U+1F251.
    u"\U000024C2-\U0001F251",
)


def remove_emoji(text):
    """Delete every run of characters that falls in the ranges listed above.

    Args:
        text: the string to filter.

    Returns:
        *text* with the matching characters removed.
    """
    character_class = "[" + "".join(_EMOJI_CLASS_MEMBERS) + "]+"
    matcher = re.compile(character_class, flags=re.UNICODE)
    stripped = matcher.sub(r'', text)
    return stripped


def _normalize_comment(text):
    """Run one comment through the full clean-up chain, in the original order."""
    text = clean(text)
    text = replacer.replace(text)
    text = normalizeWhazm(text)
    return remove_emoji(text)


# Rows without a comment cannot be cleaned (the string methods used by the
# helpers fail on a missing value), so they are dropped first; any remaining
# non-string cell is converted to text.
df = df.dropna(subset=["Comment"]).copy()
df["Comment"] = df["Comment"].astype(str).map(_normalize_comment)

# Echo the cleaned comments, as the row-by-row loop used to do.
for comment in df["Comment"]:
    print(comment)


df.head()

"""# DELETE STOP WORDS"""

path2='/content/sample_data/my_stop_words (1).txt'
data = pd.read_csv(path2, header = None)

# The stop words live in the first column of `data`. They are gathered into a
# set once so each token is checked with one hash lookup instead of a scan over
# every row of the DataFrame. Surrounding whitespace is stripped because an
# entry carrying stray spaces could never equal a token.
_STOP_WORDS = set(data[0].dropna().astype(str).str.strip())


# Tokenise each row first, then remove the stop words and put the result back
# into the dataset.
def TokDelStopW(text):
    """Tokenise *text* and drop every token that is a stop word.

    Args:
        text: the sentence to filter.

    Returns:
        The remaining tokens, in their original order, joined by single spaces.
    """
    return " ".join(token for token in word_tokenize(text) if token not in _STOP_WORDS)
  
# Strip the stop words from every comment, echo each result, then persist the
# cleaned dataset.
df["Comment"] = df["Comment"].map(TokDelStopW)
for filtered_comment in df["Comment"]:
  print(filtered_comment)

df.to_csv('clean_data.csv')

"""# BUILD VOCABULARY

define vocabulary class
"""

#The first thing to do is to create values for our start of sentence,
# end of sentence, and sentence padding special tokens

class Vocabulary:
    """Word <-> index mapping with occurrence counts for a text corpus.

    Indices 0-2 are reserved for the PAD, SOS and EOS special tokens, so the
    first real word receives index 3 and ``num_words`` starts at 3 (it counts
    the special tokens as well as the distinct words).
    """

    PAD_token = 0   # Used for padding short sentences
    SOS_token = 1   # Start-of-sentence token
    EOS_token = 2   # End-of-sentence token

    def __init__(self, name):
        """Create an empty vocabulary labelled *name*."""
        self.name = name
        # word -> index
        self.word2index = {}
        # word -> total number of occurrences seen so far
        self.word2count = {}
        # index -> word; the special tokens are registered right away
        self.index2word = {
            self.PAD_token: "PAD",
            self.SOS_token: "SOS",
            self.EOS_token: "EOS",
        }
        # next free index; equals 3 + number of distinct words added
        self.num_words = 3
        # number of add_sentence() calls
        self.num_sentences = 0
        # largest token count of any sentence added so far
        self.longest_sentence = 0

    def __repr__(self):
        return (
            f"{type(self).__name__}(name={self.name!r}, "
            f"num_words={self.num_words}, num_sentences={self.num_sentences})"
        )

    def add_word(self, word):
        """Register one occurrence of *word*, assigning it an index if new."""
        if word not in self.word2index:
            # First entry of word into vocabulary
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            # Word exists; increase word count
            self.word2count[word] += 1

    def add_sentence(self, sentence):
        """Add every whitespace-separated token of *sentence* to the vocabulary.

        Splitting uses ``str.split()`` with no argument, so an empty sentence
        or consecutive spaces never register the empty string as a word.
        """
        tokens = sentence.split()
        for word in tokens:
            self.add_word(word)
        self.longest_sentence = max(self.longest_sentence, len(tokens))
        # Every call counts as one sentence, even when it contains no tokens.
        self.num_sentences += 1

    def to_word(self, index):
        """Return the word stored at *index*; raises KeyError if unknown."""
        return self.index2word[index]

    def to_index(self, word):
        """Return the index of *word*; raises KeyError if unknown."""
        return self.word2index[word]

"""instantiation a vocabulary class for whole dataset"""

# Vocabulary over every comment in the dataset.
voc_Doc = Vocabulary('doc')
print(voc_Doc)

for text in df["Comment"]:
    print(text)
    voc_Doc.add_sentence(text)

print(f"number of sentences in whole document = {voc_Doc.num_sentences}")
print(f'number of unique word in whole document = {voc_Doc.num_words}')

# (word, count) pairs ordered by ascending count, ties broken by the word.
sortedL_doc = sorted(voc_Doc.word2count.items(), key=lambda pair: pair[::-1])
print(sortedL_doc)

# Parallel lists: the words and their counts, in the sorted order above.
vocab = [word for word, _ in sortedL_doc]
vector = [count for _, count in sortedL_doc]

"""**select negative comments and count vocabularies of it**"""

# Rows whose Label equals 1 are used as the negative comments.
is_negative = df['Label'] == 1
neg_doc = df[is_negative]

# NOTE(review): mode='a' appends to the file, so running this cell again adds
# the same rows once more - confirm that is intended.
neg_doc.to_csv(r'c:\data\negative.txt', header=None, index=None, sep='\t', mode='a')

# Vocabulary restricted to the negative comments.
neg_Doc_v = Vocabulary('neg_doc_v')
for text in neg_doc["Comment"]:
    neg_Doc_v.add_sentence(text)
neg_doc.head(10)

print(f"number of sentences in whole documents = {neg_Doc_v.num_sentences}")
print(f'number of unique word in whole documents = {neg_Doc_v.num_words}')

# (word, count) pairs ordered by ascending count, ties broken by the word.
sortedL_neg_Doc = sorted(neg_Doc_v.word2count.items(), key=lambda pair: pair[::-1])

vocab_neg = [word for word, _ in sortedL_neg_Doc]
vector_neg = [count for _, count in sortedL_neg_Doc]
print(sortedL_neg_Doc)

"""**Select positive comments and count their vocabulary**"""

# Rows whose Label equals 3 are used as the positive comments.
is_positive = df['Label'] == 3
pos_doc = df[is_positive]

# NOTE(review): both files are opened with mode='a', so running this cell again
# appends the same rows once more - confirm that is intended.
pos_doc.to_csv(r'c:\data\posetive.txt', header=None, index=None, sep='\t', mode='a')
df.to_csv(r'c:\data\df.txt', header=None, index=None, sep='\t', mode='a')

# Vocabulary restricted to the positive comments.
pos_Doc_v = Vocabulary('pos_doc_v')
for text in pos_doc["Comment"]:
    pos_Doc_v.add_sentence(text)
pos_doc.head(10)

print(f"number of sentences in whole document = {pos_Doc_v.num_sentences}")
print(f'number of unique word in whole document = {pos_Doc_v.num_words}')

# (word, count) pairs ordered by ascending count, ties broken by the word.
sortedL_pos_Doc = sorted(pos_Doc_v.word2count.items(), key=lambda pair: pair[::-1])

vocab_pos = [word for word, _ in sortedL_pos_Doc]
vector_pos = [count for _, count in sortedL_pos_Doc]
print(sortedL_pos_Doc)

"""**Counting the number of unique words in the whole dataset, the negative data and the positive data**"""

# `vocab` is sorted by ascending count, so this slice keeps the words from
# position 560 onwards, i.e. the most frequent end of the list.
N_Fre_w = vocab[560:]

# Not used in this cell; the import is kept in case another cell relies on the name.
from numpy.ma.core import append

# One column per selected word and three rows:
#   row 0 - occurrences in the whole dataset
#   row 1 - occurrences in the positive comments (0 if the word never appears)
#   row 2 - occurrences in the negative comments (0 if the word never appears)
new_df = pd.DataFrame({
    word: [
        voc_Doc.word2count[word],
        pos_Doc_v.word2count.get(word, 0),
        neg_Doc_v.word2count.get(word, 0),
    ]
    for word in N_Fre_w
})

new_df

"""**Plot the distribution of each word in the positive and negative data**"""

def _plot_word_distribution(word):
    """Draw a donut chart of how *word* splits between positive and negative comments.

    Reads rows 1 and 2 of ``new_df``, which hold the positive and negative
    counts respectively. Nothing is drawn when both counts are zero, because
    the shares of a pie are undefined in that case.
    """
    counts = {
        "positive": new_df.loc[1, word],
        "negative": new_df.loc[2, word],
    }
    if sum(counts.values()) == 0:
        print(f"{word}: no occurrences in positive or negative comments, plot skipped")
        return

    colors = ['#3fba36', '#66b3ff','#ffcc99','#ff9999', '#d44444']
    fig1, ax1 = plt.subplots()
    ax1.pie(list(counts.values()), labels=list(counts.keys()), colors=colors,
            autopct='%1.1f%%', startangle=90)
    # A white disc over the centre turns the pie into a donut.
    centre_circle = plt.Circle((0,0),0.70,fc='white')
    ax1.add_artist(centre_circle)
    # Equal aspect ratio ensures that pie is drawn as a circle
    ax1.axis('equal')
    plt.tight_layout()
    # Uncomment the following line if you want to save the figure
    # plt.savefig('distribution.png')
    plt.show()


# Single example first; guarded because the word is only a column of new_df
# when it is among the selected frequent words.
if "غذا" in new_df.columns:
    _plot_word_distribution("غذا")

# Then one chart per selected word.
for i in new_df.columns.values.tolist():
  print(i)
  _plot_word_distribution(i)

def BAR_PLOT(X,Y):
    """Show a bar chart with the values of *X* on the x-axis and *Y* as bar heights.

    Args:
        X: category labels (here: words).
        Y: one numeric value per label (here: occurrence counts).
    """
    plt.figure(figsize=(40, 10))
    plt.title("frequent word in document ")
    plt.xlabel("Category")
    plt.ylabel("Number of word in document ")
    plt.xticks(rotation=90)
    # x/y are passed by keyword: recent seaborn releases no longer accept the
    # data variables as positional arguments.
    sns.barplot(x=X, y=Y)
    plt.show()


# The lists are sorted by ascending count, so each slice keeps the most
# frequent end of its list.
BAR_PLOT(vocab[570:],vector[570:])

BAR_PLOT(vocab_pos[210:],vector_pos[210:])

BAR_PLOT(vocab_neg[210:],vector_neg[210:])

"""**Word** **Clouds**"""

!pip install persian_wordcloud
!pip install wordcloud-fa
!sudo apt-get install python3-dev

from wordcloud_fa import WordCloudFa


def _render_wordcloud(source_path, image_path):
    """Build a word cloud from the text in *source_path* and save it to *image_path*.

    The file is read as UTF-8 explicitly so the result does not depend on the
    platform's default encoding.

    Returns:
        The image object produced by ``to_image()``.
    """
    wodcloud = WordCloudFa(no_reshape=False, persian_normalize=True, include_numbers=False, collocations=False, width=800, height=400)
    with open(source_path, 'r', encoding='utf-8') as file:
        text = file.read()
    image = wodcloud.generate(text).to_image()
    image.show()
    image.save(image_path)
    return image


# Whole dataset, positive comments and negative comments, in that order.
image = _render_wordcloud('/content/sample_data/c__data_df.txt', 'dfdata.png')

image = _render_wordcloud('/content/sample_data/c__data_posetive (2).txt', 'posdata.png')

image = _render_wordcloud('/content/sample_data/c__data_negative (1).txt', 'negdata.png')

"""**TFIDF**"""

# Term frequency
def termfreq(document, word):
    """Return the share of tokens in *document* that are equal to *word*.

    Args:
        document: text, split on whitespace into tokens.
        word: the token to count.

    Returns:
        occurrences of *word* divided by the number of tokens, or 0.0 when
        *document* contains no tokens (instead of dividing by zero).
    """
    tokens = document.split()
    if not tokens:
        return 0.0
    return tokens.count(word) / len(tokens)

# Inverse document frequency
def inverse_doc_freq(word,doc):
    """Return log(doc.num_sentences / (doc.word2count[word] + 1)).

    A word missing from ``doc.word2count`` is treated as having a count of 0,
    so the denominator is 1.

    NOTE(review): when *doc* is a Vocabulary, ``word2count`` holds the total
    number of occurrences rather than the number of sentences containing the
    word, so the ratio can fall below 1 and the result can be negative -
    confirm this is the intended weighting.

    Args:
        word: the token to weight.
        doc: object exposing ``word2count`` (a mapping) and ``num_sentences``.

    Returns:
        The natural logarithm of the ratio described above.
    """
    # dict.get replaces the former bare `except:`, which also hid unrelated errors.
    word_occurance = doc.word2count.get(word, 0) + 1
    return np.log(doc.num_sentences/word_occurance)

# Combining the TF-IDF functions
def tf_idf(sentence,doc):
    """Encode *sentence* as a TF-IDF vector indexed by ``doc.word2index``.

    Args:
        sentence: text, split on whitespace into tokens.
        doc: object exposing ``num_words``, ``word2index``, ``word2count`` and
            ``num_sentences``.

    Returns:
        A 1-D array of length ``doc.num_words``. The entry at a word's index
        is tf * idf; every other entry is 0. Tokens absent from
        ``doc.word2index`` are skipped instead of raising KeyError.
    """
    tf_idf_vec = np.zeros((doc.num_words,))
    # Each distinct token is scored once; a repeated token would produce the
    # same value and be written to the same slot again.
    for word in set(sentence.split()):
        index = doc.word2index.get(word)
        if index is None:
            continue  # out-of-vocabulary token: nothing to write
        tf = termfreq(sentence,word)
        idf = inverse_doc_freq(word,doc)
        tf_idf_vec[index] = tf*idf
    return tf_idf_vec

# Apply the TF-IDF model to the text: one vector per comment, in dataset order,
# each computed against the whole-dataset vocabulary.
vectors = [tf_idf(comment, voc_Doc) for comment in df["Comment"]]

print(vectors[40])
print(len(vectors))