# KwikCluster.py
import cProfile
import sys
import getopt

from numpy import random

from MinHash import MinHash, Banding, JaccardMatchFunction

__author__ = 'Matt Barnes'
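
# Example invocation (the file names below are hypothetical, shown for illustration only):
#
#   python KwikCluster.py -i documents.txt -o clusters.txt -t 0.9 -f 200 -p 4 -m 100000
#
# Each line of documents.txt is treated as one document of space-separated unigram
# tokens; each line of clusters.txt lists the 0-indexed line numbers of one cluster.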


def main(argv):
    """
    Cluster a plain-text file, one document per line, with unigram (space-separated) tokens.

    :param argv: Command-line arguments; see the helpline below
    """
    number_hash_functions = 200
    threshold = 0.9
    number_processes = 1
    max_lines = float('inf')
    helpline = ('KwikCluster.py -i <inputfile> -o <outputfile> -t <threshold> '
                '-f <numberhashfunctions> -p <numberprocesses> -m <maxlines>')
    try:
        opts, args = getopt.getopt(argv, "hi:o:t:f:p:m:",
                                   ["ifile=", "ofile=", "threshold=", "hashfunctions=", "processes=", "maxlines="])
    except getopt.GetoptError:
        print(helpline)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(helpline)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            input_file = arg
        elif opt in ("-o", "--ofile"):
            output_file = arg
        elif opt in ("-t", "--threshold"):
            threshold = float(arg)
        elif opt in ("-f", "--hashfunctions"):
            number_hash_functions = int(arg)
        elif opt in ("-p", "--processes"):
            number_processes = int(arg)
        elif opt in ("-m", "--maxlines"):
            max_lines = int(arg)
    minhash = MinHash(number_hash_functions)
    bands = Banding(number_hash_functions, threshold, number_processes=number_processes)
    doc_ids_to_cluster = set()
    with open(input_file, 'r') as ins:
        for line_number, line in enumerate(ins):
            if line_number >= max_lines:
                break
            if line_number % 1000 == 0:
                print('Reading in document ' + str(line_number))
            doc_ids_to_cluster.add(line_number)
            tokens = line.strip().split(' ')  # unigram tokens; strip the trailing newline
            minhash.add_document(line_number, tokens)
    minhash.finish()
    bands.add_signatures(minhash.signatures)
    match_function = JaccardMatchFunction(minhash, bands).match_function
    clusters = kwik_cluster(match_function, doc_ids_to_cluster)
    print('Finished clustering. Found ' + str(len(clusters)) + ' clusters')
    with open(output_file, 'w') as outs:
        for cluster in clusters:
            line = ' '.join(str(doc_index) for doc_index in cluster)
            outs.write(line + '\n')


def kwik_cluster(match_function, doc_indices):
    """
    KwikCluster (Ailon et al. 2008), with edges between any docs sharing at least one "feature"

    :param match_function: Function handle. match_function(pivot_doc_index) returns the set of all doc indices with an edge to pivot_doc_index
    :param doc_indices: Set of doc indices to cluster (consumed by this function)
    :return clusters: Frozen set of frozen sets, where each inner set contains the doc ids in that cluster
    """
    print('Running KwikCluster on documents...')
    clusters = set()
    while doc_indices:
        if len(clusters) % 100 == 0:
            print('  KwikCluster on remaining ' + str(len(doc_indices)) + ' documents')
        pivot_index = doc_indices.pop()
        cluster = match_function(pivot_index).intersection(doc_indices)
        cluster.add(pivot_index)
        doc_indices.difference_update(cluster)
        clusters.add(frozenset(cluster))
    clusters = frozenset(clusters)
    print('Clustered into ' + str(len(clusters)) + ' clusters')
    return clusters
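
# A minimal usage sketch for kwik_cluster on its own, substituting a toy symmetric
# adjacency dict for the MinHash-backed match function (the dict and doc ids below
# are made up for illustration):
#
#   adjacency = {0: {0, 1}, 1: {0, 1}, 2: {2}}
#   clusters = kwik_cluster(lambda i: adjacency[i], {0, 1, 2})
#   # -> frozenset({frozenset({0, 1}), frozenset({2})})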


class ConsensusClusteringMatchFunction:
    """
    Probabilistic match function: two docs match with probability equal to the
    fraction of the input clusterings that place them in the same cluster
    """
    def __init__(self, clusterings):
        self.list_id_to_matches = []
        for clustering in clusterings:
            id_to_matches = dict()
            for cluster in clustering:
                for doc_id in cluster:
                    id_to_matches[doc_id] = cluster
            self.list_id_to_matches.append(id_to_matches)

    def match_function(self, doc_id):
        """
        Returns all matching doc ids, probabilistically

        :param doc_id: Doc ID to match against
        :return matches: Set of matching doc ids (including doc_id itself)
        """
        potential_matches = dict()
        for id_to_matches in self.list_id_to_matches:
            for potential_match in id_to_matches[doc_id]:
                potential_matches[potential_match] = potential_matches.get(potential_match, 0) + 1
        number_clusterings = len(self.list_id_to_matches)
        probs = random.uniform(size=len(potential_matches))
        matches = set([doc_id])
        for prob, (potential_match, count) in zip(probs, potential_matches.items()):
            # Accept each candidate with probability count / number_clusterings,
            # i.e. the fraction of clusterings that link it to doc_id
            if float(count) / number_clusterings > prob:
                matches.add(potential_match)
        return matches
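
# Illustrative numbers (assuming the fraction-of-links semantics above): with 4 input
# clusterings where docs A and B share a cluster in 3 of them, match_function(A)
# includes B with probability 3/4, so repeated calls may return different sets.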


def consensus_clustering(clusterings):
    """
    Consensus clustering with KwikCluster (Ailon et al. 2008): an 11/7-approximation algorithm that runs in linear time

    :param clusterings: List of clusterings, where each clustering is a frozen set of clusters (and each cluster is a frozen set of doc ids)
    :return clusters: Frozen set of frozen sets, where each inner set contains the doc ids in that cluster
    """
    doc_ids = set()
    for clustering in clusterings:
        for cluster in clustering:
            doc_ids.update(cluster)
    match_function = ConsensusClusteringMatchFunction(clusterings).match_function
    clusters = kwik_cluster(match_function, doc_ids)
    return clusters
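
# A minimal sketch with made-up clusterings: merge two clusterings of docs {0, 1, 2}
# into one consensus clustering (the output is probabilistic, per the match function above):
#
#   c1 = frozenset([frozenset([0, 1]), frozenset([2])])
#   c2 = frozenset([frozenset([0, 1, 2])])
#   consensus = consensus_clustering([c1, c2])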


def clusters_to_labels(clusters):
    """
    :param clusters: Iterable of clusters, where each cluster is an iterable of doc ids
    :return labels: Dict mapping doc_id to cluster_label, where labels are positive ints starting at 1
    """
    labels = dict()
    for label, cluster in enumerate(clusters, start=1):
        for doc_id in cluster:
            labels[doc_id] = label
    return labels
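
# For example, clusters_to_labels([frozenset([0, 1]), frozenset([2])]) returns a dict
# such as {0: 1, 1: 1, 2: 2} (which cluster gets which label depends on iteration order).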


def clean(doc_to_features, feature_to_docs, doc_id):
    """
    Removes a document from both feature indices

    :param doc_to_features: Dict mapping doc_id to its set of features
    :param feature_to_docs: Dict mapping feature to the set of doc ids containing it
    :param doc_id: Doc ID to remove
    """
    features = doc_to_features.pop(doc_id)
    for feature in features:
        feature_to_docs[feature].remove(doc_id)
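
# A hypothetical usage sketch, with toy indices:
#
#   doc_to_features = {0: {'a', 'b'}, 1: {'b'}}
#   feature_to_docs = {'a': {0}, 'b': {0, 1}}
#   clean(doc_to_features, feature_to_docs, 0)
#   # feature_to_docs is now {'a': set(), 'b': {1}}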


if __name__ == '__main__':
    cProfile.run('main(sys.argv[1:])')