-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlib_clusterizer.py
126 lines (120 loc) · 5.56 KB
/
lib_clusterizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# coding: utf-8
import datetime
class clusterizer:
    """Group similar rows of the ``docs`` table that come from different sources.

    Two strategies are available:

    * ``run``      - fetches every row in the date range and compares the
      token lists of ``field`` pairwise in Python, using the overlap
      coefficient ``2 * shared / (len_a + len_b)``.
    * ``run_trgm`` - lets the database pair the rows with the ``<->``
      operator (presumably the pg_trgm distance operator - confirm against
      the schema) and only groups the returned pairs.

    Both methods store their result in ``self.found`` and return ``self``.
    ``self.found`` is a list of dicts with the keys ``'id'``, ``'date'``,
    ``<field>`` and ``'link'`` (copied from the first member), ``'cluster'``
    (list of member dicts with the same four keys) and ``'ids'`` (list of
    member ids).

    NOTE(review): ``field`` is concatenated into the SQL text as a column
    name and cannot be sent as a bound parameter; it must never come from
    untrusted input.
    """

    def __init__(self,
                 db=None,
                 start_date=None,
                 end_date=None,
                 field='doc_title',
                 threshold=0.7):
        """Store the query settings.

        db         -- DB-API style connection (``cursor()`` / ``commit()``);
                      when falsy both run methods produce an empty result.
        start_date -- lower bound for ``doc_date``; ``None`` means
                      ``datetime.date.today()``.
        end_date   -- upper bound for ``doc_date``; ``None`` means
                      ``datetime.datetime.today()``.
        field      -- name of the ``docs`` column that is compared.
        threshold  -- minimal similarity for two rows to be clustered.
        """
        self.db = db
        # The date defaults are resolved per instance: a default written as
        # ``today()`` in the signature is evaluated only once, when the
        # class is defined, and goes stale in a long-running process.
        if start_date is None:
            start_date = datetime.date.today()
        if end_date is None:
            end_date = datetime.datetime.today()
        self.start_date = start_date
        self.end_date = end_date
        self.field = field
        self.threshold = threshold
        self.found = []

    def _fetch_all(self, sql, params):
        """Execute *sql* with *params*, return all rows, then commit."""
        cur = self.db.cursor()
        try:
            cur.execute(sql, params)
            rows = cur.fetchall()
        finally:
            # Release the cursor even when the query fails.
            cur.close()
        self.db.commit()
        return rows

    def _member(self, row, offset=0):
        """Build a cluster member dict from four consecutive columns of *row*."""
        return {'id': row[offset],
                'date': row[offset + 1],
                self.field: row[offset + 2],
                'link': row[offset + 3]}

    def _new_entry(self, members, ids):
        """Build a ``self.found`` entry headed by the first of *members*."""
        head = members[0]
        return {'id': head['id'],
                'date': head['date'],
                self.field: head[self.field],
                'link': head['link'],
                'cluster': members[:],
                'ids': ids[:]}

    def _entry_holding_any(self, ids):
        """Return the ``self.found`` entry that contains one of *ids*, or None.

        When several ids belong to different entries, the entry matched by
        the last such id is returned.
        """
        match = None
        for doc_id in ids:
            for entry in self.found:
                if doc_id in entry['ids']:
                    match = entry
                    break
        return match

    def run(self):
        """Cluster rows by comparing their token lists pairwise in Python.

        Rows are compared only when their ``source_id`` differs. Returns
        ``self``; the clusters are stored in ``self.found``.
        """
        self.found = []
        if not self.db:
            return self

        col = self.field
        sql = "select id, doc_date, " + col + ", doc_link, string_to_array(cast(plainto_tsquery(" \
            + col + ") as text), ' & '), source_id " \
            + "from docs where doc_date >= %s and doc_date <= %s " \
            + "order by doc_date, id"
        rows = self._fetch_all(sql, (self.start_date, self.end_date))

        # Column 4 is the token array; a NULL field yields None, which is
        # treated as "no tokens" instead of crashing the comparison.
        tokens = [row[4] or [] for row in rows]
        # Sets make the membership test O(1); the count below still walks
        # the left-hand *list*, so repeated tokens are counted as before.
        token_sets = [set(toks) for toks in tokens]

        for i in range(len(rows) - 1):
            seed = rows[i]
            cluster = [self._member(seed)]
            ids = [seed[0]]
            for j in range(i + 1, len(rows)):
                other = rows[j]
                if seed[5] == other[5]:
                    # Same source: never clustered together.
                    continue
                total_len = len(tokens[i]) + len(tokens[j])
                if total_len == 0:
                    coeff = 0
                else:
                    shared = sum(1 for tok in tokens[i] if tok in token_sets[j])
                    coeff = 2.0 * shared / total_len
                if coeff >= self.threshold:
                    cluster.append(self._member(other))
                    ids.append(other[0])
            if len(cluster) > 1:
                target = self._entry_holding_any(ids)
                if target is None:
                    self.found.append(self._new_entry(cluster, ids))
                else:
                    # Members must be filtered before 'ids' is extended,
                    # otherwise the new members would filter themselves out.
                    target['cluster'].extend(
                        [m for m in cluster if m['id'] not in target['ids']])
                    target['ids'].extend(
                        [n for n in ids if n not in target['ids']])
        return self

    def run_trgm(self):
        """Cluster rows from pairs that the database already matched.

        The query returns every pair of rows from different sources whose
        ``<->`` value on ``field`` is at most ``1 - threshold``. Returns
        ``self``; the clusters are stored in ``self.found``. The elapsed
        time is printed to stdout.
        """
        started_time = datetime.datetime.now()
        self.found = []
        rows = []
        if self.db:
            col = self.field
            sql = "select d1.id, d1.doc_date, d1." + col + ", d1.doc_link, " \
                + "d1." + col + " <-> d2." + col + ", " \
                + "d2.id, d2.doc_date, d2." + col + ", d2.doc_link " \
                + "from docs d1, docs d2 " \
                + "where d1.id <> d2.id and d1.source_id <> d2.source_id " \
                + "and d1.doc_date >= %s and d1.doc_date <= %s and d2.doc_date >= %s and d2.doc_date <= %s " \
                + "and d1." + col + " <-> d2." + col + " <= " + str(1 - self.threshold) + " " \
                + "order by d1.doc_date, d1.id"
            params = (self.start_date, self.end_date, self.start_date, self.end_date)
            rows = self._fetch_all(sql, params)

        for row in rows:
            # Columns 0-3 describe the left document, 5-8 the right one.
            left_id = row[0]
            right_id = row[5]
            has_left = False
            has_right = False
            target = None
            # The first entry that knows either document wins.
            for entry in self.found:
                has_left = left_id in entry['ids']
                has_right = right_id in entry['ids']
                if has_left or has_right:
                    target = entry
                    break
            if target is None:
                members = [self._member(row, 0), self._member(row, 5)]
                self.found.append(self._new_entry(members, [left_id, right_id]))
            elif has_left and not has_right:
                target['cluster'].append(self._member(row, 5))
                target['ids'].append(right_id)
            elif has_right and not has_left:
                target['cluster'].append(self._member(row, 0))
                target['ids'].append(left_id)
            # Both already in the entry: nothing to add.
        print(datetime.datetime.now() - started_time)
        return self
#------------------------------------------------------------------