import sys
import os
import zlib

# zlib compression level used for every measurement in this module. It is
# looked up at call time, so other modules may rebind NCD.COMPRESSIONLEVEL
# before calling NCDkernel.
COMPRESSIONLEVEL = 6


def _as_bytes(doc):
    """Return doc unchanged if it is a byte string, else its UTF-8 encoding."""
    if isinstance(doc, bytes):
        return doc
    return doc.encode('utf-8')


def _compressed_size(data):
    """Return the number of bytes zlib emits for data at COMPRESSIONLEVEL."""
    return len(zlib.compress(data, COMPRESSIONLEVEL))


def NCDkernel(doc1, doc2, doc1_compressed_size=-1, doc2_compressed_size=-1):
    """Return the normalized compression distance (NCD) between two documents.

    The distance is

        (C(doc1 + doc2) - min(C(doc1), C(doc2))) / max(C(doc1), C(doc2))

    where C(x) is the length in bytes of x after zlib compression at
    COMPRESSIONLEVEL.

    Parameters:
        doc1, doc2: document contents, as byte strings or text. Text is
            encoded as UTF-8 before compression.
        doc1_compressed_size, doc2_compressed_size: optional precomputed
            compressed sizes for doc1 / doc2. The default of -1 means
            "compute it here".

    Returns:
        The distance as a float.
    """
    doc1 = _as_bytes(doc1)
    doc2 = _as_bytes(doc2)

    # len() gives the size of the compressed payload. sys.getsizeof would
    # report the in-memory size of the Python object instead, which adds a
    # constant header overhead to every term and skews the ratio.
    if doc1_compressed_size == -1:
        doc1_compressed_size = _compressed_size(doc1)
    if doc2_compressed_size == -1:
        doc2_compressed_size = _compressed_size(doc2)
    bothdocs_compressed_size = _compressed_size(doc1 + doc2)

    smaller = min(doc1_compressed_size, doc2_compressed_size)
    larger = max(doc1_compressed_size, doc2_compressed_size)

    # 1.0 * forces float division on Python 2 as well.
    return 1.0 * (bothdocs_compressed_size - smaller) / larger
#from svm import *
import sys
import os
import zlib
import NCD
from homeworkmap import buildMap
NCD.COMPRESSIONLEVEL = 6


class StudentPair:
    """Two student identifiers and the NCD distance between their homework."""

    def __init__(self, studentx, studenty, distance):
        self.studentx = studentx
        self.studenty = studenty
        self.distance = distance


def _read_file(path):
    """Return the full contents of the file at path, closing it afterwards."""
    with open(path, 'r') as handle:
        return handle.read()


def compareStudents(homeworkpath):
    """Return the five student pairs whose homework files are closest.

    buildMap(homeworkpath) supplies a mapping from student to the path of a
    homework file; every unordered pair of students is compared with
    NCD.NCDkernel.

    Parameters:
        homeworkpath: directory handed to buildMap.

    Returns:
        A list of exactly five StudentPair objects ordered by ascending
        distance. Only pairs with distance < 1 qualify; unused slots hold the
        placeholder StudentPair(None, None, 1).
    """
    studenthomeworkmap = buildMap(homeworkpath)
    # Materialise the keys so they can be sliced on Python 3 as well.
    students = list(studenthomeworkmap.keys())

    # Read every submission once instead of once per comparison.
    homework = dict(
        (student, _read_file(studenthomeworkmap[student]))
        for student in students
    )

    candidates = []
    for index, studentx in enumerate(students):
        for studenty in students[index + 1:]:
            xydistance = NCD.NCDkernel(homework[studentx], homework[studenty])
            # Same admission rule as the placeholder comparison: a pair only
            # displaces a placeholder when its distance is strictly below 1.
            if xydistance < 1:
                candidates.append(StudentPair(studentx, studenty, xydistance))

    # Keep the five smallest distances. Overwriting the first larger slot in
    # place (the previous approach) silently discarded earlier close pairs.
    candidates.sort(key=lambda pair: pair.distance)
    closest = candidates[:5]
    while len(closest) < 5:
        closest.append(StudentPair(None, None, 1))
    return closest


if __name__ == "__main__":
    # Guarded so that importing this module does not require sys.argv[1].
    biggest5 = compareStudents(sys.argv[1])

    for pair in biggest5:
        print(pair.studentx)
        print(pair.studenty)
        print(pair.distance)
        print('-------------')
#from svm import *
import sys
import os
import zlib
import NCD
import kmeans
from homeworkmap import buildMap
import pickle
NCD.COMPRESSIONLEVEL = 6


def _read_file(path):
    """Return the full contents of the file at path, closing it afterwards."""
    with open(path, 'r') as handle:
        return handle.read()


class BasicClusters:
    """Base class: builds the student -> homework map and stores its clustering.

    Subclasses must provide _cluster(); its return value is stored in
    self.clusters and is rendered by __str__ as a mapping of cluster key to
    cluster contents.
    """

    def __init__(self, homeworkpath):
        self.studenthomeworkmap = buildMap(homeworkpath)
        self.clusters = self._cluster()

    def __str__(self):
        """Return one "key: members" entry per cluster, each followed by a rule."""
        separator = "\n-----------------------------------\n"
        return "".join(
            str(cluster) + ": " + str(self.clusters[cluster]) + separator
            for cluster in self.clusters
        )


class SimpleClusters(BasicClusters):
    """Threshold clustering: students closer than clustergap share a cluster."""

    def __init__(self, homeworkpath, clustergap=0.25):
        # Must be set before the base initialiser, which calls _cluster().
        self.clustergap = clustergap
        BasicClusters.__init__(self, homeworkpath)

    def _cluster(self):
        """Return a dict mapping a cluster key (a student) to its member list.

        Each student is compared against the members of every existing
        cluster. All clusters containing at least one member closer than
        self.clustergap are merged, together with the student, into a new
        cluster keyed by that student. A student matching nothing starts a
        singleton cluster.
        """
        clusters = {}

        for student in list(self.studenthomeworkmap.keys()):
            studentxhomework = _read_file(self.studenthomeworkmap[student])

            foundclusters = []
            for cluster in clusters:
                for member in clusters[cluster]:
                    # Progress trace: one line per comparison performed.
                    print("%s %s" % (student, member))
                    studentyhomework = _read_file(self.studenthomeworkmap[member])
                    xydistance = NCD.NCDkernel(studentxhomework, studentyhomework)
                    if xydistance < self.clustergap:
                        # One close member is enough; skip the rest.
                        foundclusters.append(cluster)
                        break

            # Merge every matching cluster (possibly none) with the student.
            newcluster = []
            for cluster in foundclusters:
                newcluster.extend(clusters[cluster])
                del clusters[cluster]
            newcluster.append(student)
            clusters[student] = newcluster

        return clusters


class KMeansClusters(SimpleClusters):
    """Clusters students by delegating to kmeans.kmeans with an NCD metric."""

    def __init__(self, homeworkpath, k=6):
        self.k = k
        # Deliberately bypasses SimpleClusters.__init__: clustergap is not
        # used by this class's _cluster().
        BasicClusters.__init__(self, homeworkpath)

    def _cluster(self):
        """Return whatever kmeans.kmeans produces for the students and self.k."""
        # A real list, so kmeans may index or re-iterate it on Python 3 too.
        data = list(self.studenthomeworkmap.keys())
        return kmeans.kmeans(data, self._metric, kmeans.quickguess(data, self.k), self.k)

    def _metric(self, x, y):
        """Return the NCD distance between the homework of students x and y."""
        xdata = _read_file(self.studenthomeworkmap[x])
        ydata = _read_file(self.studenthomeworkmap[y])
        return NCD.NCDkernel(xdata, ydata)


if __name__ == "__main__":
    clustered = KMeansClusters(sys.argv[1], int(sys.argv[2]))
    print(clustered)
    # Context manager so the pickle is flushed and the handle closed.
    with open("lastcluster.p", "wb") as output:
        pickle.dump(clustered, output)