-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTF-IDF.py
90 lines (81 loc) · 2.33 KB
/
TF-IDF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
'''
Created on 2018-02-15
@author: mani
'''
import sys,os
import glob
import collections
import math
from functools32 import lru_cache
from multiprocessing import Pool
# Number of documents held in CORPUS; load_corpus() refreshes it after
# reading files, and it is read when computing scores.
TOTAL_DOCS = 0
# Maps each loaded file path to the collections.Counter of its tokens.
CORPUS = {}
def TF(term, doc):
    '''Return the term frequency of ``term`` inside document ``doc``.

    The frequency is the number of occurrences of ``term`` in the document
    divided by the total number of tokens counted for that document, so
    that long and short documents are scored on the same scale.

    Args:
        term: Token to look up.
        doc: Key of the document in ``CORPUS``.

    Returns:
        The frequency as a float; 0.0 when the document is unknown, has no
        tokens, or does not contain the term.
    '''
    counts = CORPUS.get(doc)
    if not counts:
        return 0.0
    occurrences = counts.get(term, 0)
    if not occurrences:
        return 0.0
    # Normalise by the length of this document, not by the number of
    # documents in the corpus (the latter is what IDF accounts for).
    return float(occurrences) / float(sum(counts.values()))
def IDF(term):
    '''Return the inverse document frequency of ``term`` over the corpus.

    Computed as ``log(TOTAL_DOCS / n)`` (natural logarithm), where ``n`` is
    the number of documents returned by ``search_doc_with_term(term)``.

    Args:
        term: Token to look up.

    Returns:
        The IDF as a float, or 0.0 when no document contains the term or
        the corpus is empty (instead of raising ZeroDivisionError or a
        math domain error).
    '''
    matching_docs = len(search_doc_with_term(term))
    # A term that occurs nowhere carries no ranking information; scoring it
    # as 0.0 keeps callers from crashing on unknown search terms.
    if not matching_docs or not TOTAL_DOCS:
        return 0.0
    return math.log(float(TOTAL_DOCS) / matching_docs)
def TF_IDF(term, doc):
    '''Score ``term`` against ``doc`` as the product ``TF(term, doc) * IDF(term)``.'''
    # Operands are evaluated left to right: TF first, then IDF.
    return TF(term, doc) * IDF(term)
def create_count(text):
    '''Count how often each lower-cased token occurs in ``text``.

    Tokens are separated by any run of whitespace (spaces, tabs, newlines),
    so words on adjacent lines are not fused together and no empty-string
    token is produced.

    Args:
        text: The raw document text.

    Returns:
        A ``collections.Counter`` mapping token -> number of occurrences.
    '''
    # split() without an argument splits on any whitespace run and drops
    # empty strings, unlike split(" ").
    return collections.Counter(text.lower().split())
def search_doc_with_term(term):
    '''Return the keys of every document in ``CORPUS`` that contains ``term``.

    Args:
        term: Token to look for among each document's counted tokens.

    Returns:
        A list of document keys, in ``CORPUS`` iteration order.
    '''
    # Membership is tested on the mapping itself (a hash lookup) rather
    # than on a materialised key list; items() works on Python 2 and 3.
    return [name for name, counts in CORPUS.items() if term in counts]
def load_corpus(dirc):
    '''Read every regular file directly inside the given directories.

    Each file's text is tokenised with ``create_count`` and stored in the
    module-level ``CORPUS`` under the file's path; ``TOTAL_DOCS`` is then
    set to the number of entries in ``CORPUS``.

    Args:
        dirc: Iterable of directory paths to scan (not recursive).
    '''
    global CORPUS, TOTAL_DOCS
    for directory in dirc:
        candidates = glob.glob(directory + '/*')
        for path in candidates:
            # Sub-directories and other non-files are skipped.
            if not os.path.isfile(path):
                continue
            with open(path) as handle:
                contents = handle.read()
            CORPUS[path] = create_count(contents)
    TOTAL_DOCS = len(CORPUS)
@lru_cache(maxsize=1024)
def search_term(term, top=10):
    '''Rank the documents in ``CORPUS`` by their TF-IDF score for ``term``.

    Args:
        term: Token to search for.
        top: Maximum number of results to return (default 10).

    Returns:
        A list of ``(document_key, score)`` tuples sorted by descending
        score, truncated to ``top`` entries; an empty list when the corpus
        is empty.

    NOTE: results are memoised by ``lru_cache``; nothing in this function
    clears that cache if ``CORPUS`` changes afterwards.
    '''
    if not CORPUS:
        return []
    # IDF depends only on the term, so compute it once instead of
    # re-scanning the whole corpus for every document.
    idf = IDF(term)
    ranked = [(doc, TF(term, doc) * idf) for doc in CORPUS]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:top]
def test_create_count(path="review_polarity/txt_sentoken/pos/cv000_29590.txt"):
    '''Manual smoke test: print every token and its count for one file, then exit.

    Args:
        path: File to tokenise; defaults to the sample review the script
            was originally checked against.

    This function terminates the interpreter via ``sys.exit()`` once all
    counts have been printed.
    '''
    with open(path) as fd:
        text = fd.read()
    for token, count in create_count(text).items():
        # A single formatted argument keeps the output identical whether
        # print is a statement (Python 2) or a function (Python 3).
        print("%s %s" % (token, count))
    sys.exit()
if __name__ == '__main__':
    # Command-line entry point: load the directories named on the command
    # line, then answer search queries interactively until end of input.
    if len(sys.argv) < 2:
        print("Please provide input directory!")
        print("Usage: TF-IDF.py [dir1] [dir2] ...")
        # Non-zero status so calling shells/scripts can detect the misuse.
        sys.exit(1)
    for d in sys.argv[1:]:
        if not os.path.isdir(d):
            print("%s is not a directory!" % d)
            sys.exit(1)
    load_corpus(sys.argv[1:])
    print("Total documents in the Corpus: %d" % len(CORPUS))
    while True:
        try:
            term = raw_input("Please enter the Term to search in the Corpus:")
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C ends the session cleanly instead of dumping
            # a traceback.
            print("")
            break
        # Stored tokens are lower-cased and contain no spaces, so
        # surrounding blanks in the query could never match.
        term = term.strip().lower()
        if not term:
            continue
        for doc, score in search_term(term):
            print("%s %s" % (score, doc))