-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathanalytics.py
69 lines (64 loc) · 1.95 KB
/
analytics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#Analytics
import csv
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.text import Text
import pickle
import sys
import os
import string
from nltk.corpus import stopwords
from nltk.tag import *
from string import punctuation
from collections import defaultdict
class FrequencySummarizer:
    """Score text against word frequencies loaded from a CSV dictionary.

    The CSV at *path* is expected to hold two columns per row:
    word, frequency-value.
    """

    def __init__(self, min_cut=0.1, max_cut=0.9):
        # Frequency cut-offs kept for interface compatibility; the current
        # code does not apply them yet.
        self._min_cut = min_cut
        self._max_cut = max_cut
        # English stopwords plus punctuation, available for filtering.
        self._stopwords = set(stopwords.words('english') + list(punctuation))

    def _words(self, path):
        """Load the CSV dictionary into a mapping of word -> value.

        Fix: the previous version opened the file in binary mode (which
        csv.reader rejects in Python 3) and stringified each row before
        re-splitting on commas, yielding corrupted keys like "['amisha'".
        Here the parsed csv rows are used directly, so keys are plain words.
        """
        freq = defaultdict(int)
        # 'r' with newline='' is the documented way to feed csv.reader.
        with open(path, 'r', newline='') as f:
            for row in csv.reader(f):
                # Skip blank or malformed rows instead of crashing.
                if len(row) >= 2:
                    freq[row[0]] = row[1]
        return freq

    def summarize(self, lines, path):
        """Tokenize *lines* and print each word found in the dictionary.

        Fix: the previous version built a (str, value) *tuple* and then
        called .split(',') on it, which always raised AttributeError, and
        used Python 2 `print` statements. Returns the list of
        (word, value) matches so callers can inspect the result.
        """
        self._freq = self._words(path)
        matches = []
        for w in word_tokenize(lines):
            value = self._freq.get(w)
            if value is not None:
                # Word is present in the CSV dictionary.
                print(w, value)
                matches.append((w, value))
        return matches
# Location of the CSV word-frequency dictionary (one word,value per row).
# Kept at module level so existing importers that read `path` still work.
path = '/home/amisha/Desktop/sourcedata/dictionary.csv'

if __name__ == '__main__':
    # Guarded so importing this module does not trigger file I/O or nltk
    # tokenization as an import side effect.
    fs = FrequencySummarizer()
    summary = fs.summarize('amisha bycycle original english.', path)