Skip to content

Commit

Permalink
First commit!
Browse files Browse the repository at this point in the history
  • Loading branch information
csbrown committed Oct 9, 2013
0 parents commit 86ca205
Show file tree
Hide file tree
Showing 7 changed files with 284 additions and 0 deletions.
4 changes: 4 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
I hereby release this work into the public domain, to the extent allowed by law, per the CC0 1.0 license
With Love,
-Scott!
gitpushoriginmaster@gmail.com
30 changes: 30 additions & 0 deletions NCD.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import sys
import os
import zlib

# zlib compression level used for every size measurement; other modules may
# rebind it (e.g. ``NCD.COMPRESSIONLEVEL = 6``) before calling NCDkernel.
COMPRESSIONLEVEL = 6


def _asbytes(doc):
    """Return ``doc`` as a byte string, UTF-8 encoding it when it is text."""
    if isinstance(doc, bytes):
        return doc
    return doc.encode('utf-8')


def _compressedsize(data):
    """Return the length in bytes of ``data`` compressed at COMPRESSIONLEVEL."""
    return len(zlib.compress(data, COMPRESSIONLEVEL))


def NCDkernel(doc1,doc2,doc1_compressed_size = -1, doc2_compressed_size = -1):
    """Return the normalized compression distance between two documents.

    The value is ``(C(doc1 + doc2) - min(C(doc1), C(doc2))) / max(C(doc1),
    C(doc2))`` where ``C`` is the zlib-compressed length in bytes.

    doc1, doc2 -- the documents, as byte strings or text (text is encoded
        as UTF-8 before compression).
    doc1_compressed_size, doc2_compressed_size -- optional precomputed
        compressed lengths; the default of -1 means "compute it here".

    Sizes are measured with len() rather than sys.getsizeof(): getsizeof
    reports the size of the Python object, which includes a constant object
    header and therefore distorts the ratio.
    """
    doc1 = _asbytes(doc1)
    doc2 = _asbytes(doc2)

    if doc1_compressed_size == -1:
        doc1_compressed_size = _compressedsize(doc1)
    if doc2_compressed_size == -1:
        doc2_compressed_size = _compressedsize(doc2)

    bothdocs_compressed_size = _compressedsize(doc1 + doc2)

    smaller = min(doc1_compressed_size, doc2_compressed_size)
    larger = max(doc1_compressed_size, doc2_compressed_size)

    # 1.0 * forces float division on Python 2 as well.
    return 1.0 * (bothdocs_compressed_size - smaller) / larger
1 change: 1 addition & 0 deletions README
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is a program that performs unsupervised clustering of files using the NCD (Normalized Compression Distance). It takes a bunch of files and clusters them with a variant of k-means.
57 changes: 57 additions & 0 deletions classify.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#from svm import *
import sys
import os
import zlib
import NCD
from homeworkmap import buildMap
NCD.COMPRESSIONLEVEL = 6

class StudentPair:
    """Record of two student identifiers and the distance between their work."""

    def __init__(self, studentx, studenty, distance):
        # Store the three constructor arguments under the same names.
        self.studentx, self.studenty, self.distance = studentx, studenty, distance

def compareStudents(homeworkpath):
    """Return the five closest pairs of student submissions under homeworkpath.

    Every pair of students found by buildMap() is compared with
    NCD.NCDkernel(). The result is a list of exactly five StudentPair
    objects ordered by ascending distance. Slots that could not be filled
    by a pair whose distance is below 1 hold the placeholder
    StudentPair(None, None, 1).
    """
    studenthomeworkmap = buildMap(homeworkpath)
    students = list(studenthomeworkmap)

    # Read each submission once up front instead of re-reading it for every
    # pair; ``with`` closes the file even if the read fails.
    homeworks = []
    for student in students:
        with open(studenthomeworkmap[student], 'r') as homeworkfile:
            homeworks.append(homeworkfile.read())

    # Placeholders keep the list at five entries and act as the distance
    # ceiling: a pair is only recorded if it beats some current entry.
    biggest5 = [StudentPair(None, None, 1) for _ in range(5)]

    for i in range(len(students) - 1):
        for j in range(i + 1, len(students)):
            xydistance = NCD.NCDkernel(homeworks[i], homeworks[j])

            for k in range(len(biggest5)):
                if xydistance < biggest5[k].distance:
                    # Insert in sorted position and drop the current worst
                    # entry. Overwriting slot k instead would throw away a
                    # good pair while worse entries further down survive.
                    biggest5.insert(k, StudentPair(students[i], students[j], xydistance))
                    biggest5.pop()
                    break

    return biggest5


# Script body: compare the submissions under the directory named by the first
# command-line argument and print each reported pair, one field per line,
# followed by a separator line.
biggest5 = compareStudents(sys.argv[1])

for pair in biggest5:
    for field in (pair.studentx, pair.studenty, pair.distance):
        print(field)
    print('-------------')




98 changes: 98 additions & 0 deletions cluster.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#from svm import *
import sys
import os
import zlib
import NCD
import kmeans
from homeworkmap import buildMap
import pickle
NCD.COMPRESSIONLEVEL = 6

class BasicClusters:
    """Base for clusterings of student homework.

    Construction builds the student -> homework file map with buildMap() and
    stores the result of self._cluster() in self.clusters. _cluster() is not
    defined on this class; subclasses supply it.
    """

    def __init__(self, homeworkpath):
        self.studenthomeworkmap = buildMap(homeworkpath)
        self.clusters = self._cluster()

    def __str__(self):
        # One "key: members" entry per cluster, each followed by a rule line.
        separator = "\n-----------------------------------\n"
        return "".join(
            str(name) + ": " + str(self.clusters[name]) + separator
            for name in self.clusters
        )


class SimpleClusters(BasicClusters):
    """Threshold clustering of student homework.

    A student joins a cluster when the NCD between their homework and that of
    any current member is below clustergap. A student who matches several
    clusters causes those clusters to be merged.
    """

    def __init__(self, homeworkpath, clustergap = 0.25):
        # Must be set first: BasicClusters.__init__ calls self._cluster(),
        # which reads self.clustergap.
        self.clustergap = clustergap
        BasicClusters.__init__(self,homeworkpath)

    def _cluster(self):
        """Return a dict mapping a cluster key to the list of its students.

        The key of a cluster is the student that was added to it most
        recently.
        """
        # Each file is read once and kept here; re-opening both files for
        # every comparison is wasted work, and ``with`` guarantees the handle
        # is closed even if the read fails.
        contents = {}

        def homework(student):
            if student not in contents:
                with open(self.studenthomeworkmap[student], 'r') as homeworkfile:
                    contents[student] = homeworkfile.read()
            return contents[student]

        clusters = {}

        for student in self.studenthomeworkmap:
            studenthomework = homework(student)

            # Keys of every existing cluster with at least one close member.
            foundclusters = []
            for cluster in clusters:
                for member in clusters[cluster]:
                    print("%s %s" % (student, member))
                    xydistance = NCD.NCDkernel(studenthomework, homework(member))
                    if xydistance < self.clustergap:
                        foundclusters.append(cluster)
                        # One close member is enough for this cluster.
                        break

            # Merge all matching clusters (possibly none) and re-key the
            # result under the new student.
            newcluster = []
            for cluster in foundclusters:
                newcluster.extend(clusters.pop(cluster))
            newcluster.append(student)
            clusters[student] = newcluster

        return clusters


class KMeansClusters(SimpleClusters):
    """Clustering of student homework into k groups via kmeans.kmeans.

    Although this derives from SimpleClusters, the initializer calls
    BasicClusters.__init__ directly, so self.clustergap is never set on
    instances of this class.
    """

    def __init__(self,homeworkpath,k = 6):
        # Must be set first: BasicClusters.__init__ calls self._cluster(),
        # which reads self.k.
        self.k = k
        BasicClusters.__init__(self,homeworkpath)

    def _cluster(self):
        """Run kmeans over the student ids, seeded by kmeans.quickguess."""
        # A real list is needed because quickguess indexes into the data;
        # a dict keys view does not support indexing.
        data = list(self.studenthomeworkmap)
        return kmeans.kmeans(data, self._metric, kmeans.quickguess(data,self.k), self.k)

    def _readhomework(self, student):
        """Return the contents of the homework file mapped to ``student``."""
        with open(self.studenthomeworkmap[student], 'r') as homeworkfile:
            return homeworkfile.read()

    def _metric(self,x,y):
        """Return the NCD between the homework of students ``x`` and ``y``."""
        return NCD.NCDkernel(self._readhomework(x), self._readhomework(y))


if __name__ == "__main__":
    # Usage: cluster.py <homework directory> <k>
    clustered = KMeansClusters(sys.argv[1],int(sys.argv[2]))
    print(clustered)
    # Close the output file explicitly so the pickle is fully written
    # before the interpreter exits.
    with open("lastcluster.p","wb") as picklefile:
        pickle.dump(clustered, picklefile)



25 changes: 25 additions & 0 deletions homeworkmap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import os

#send this a homework directory!
def buildMap(homeworkpath):
    """Map each student to the path of a .py file in their latest submission.

    homeworkpath is expected to contain one directory per student, each
    holding one sub-directory per submission. The "latest" submission is the
    sub-directory whose name compares greatest. Within it, the first file
    (in sorted name order) with a '.py' extension is chosen.

    Students with no submission directory, or whose latest submission holds
    no .py file, are left out of the returned dict. Non-directory entries at
    either level are ignored.
    """
    studenthomeworkmap = {}
    for student in os.listdir(homeworkpath):
        submissionspath = os.path.join(homeworkpath, student)
        if not os.path.isdir(submissionspath):
            continue

        # Only directories can be submissions; a stray file whose name sorts
        # last would otherwise be picked as "latest" and crash os.listdir.
        submissions = [
            name for name in os.listdir(submissionspath)
            if os.path.isdir(os.path.join(submissionspath, name))
        ]
        if not submissions:
            continue

        latestpath = os.path.join(submissionspath, max(submissions))

        # os.listdir order is arbitrary; sort so the chosen file is
        # reproducible when a submission contains several .py files.
        for filename in sorted(os.listdir(latestpath)):
            if os.path.splitext(filename)[1] == '.py':
                studenthomeworkmap[student] = os.path.join(latestpath, filename)
                break
    return studenthomeworkmap
69 changes: 69 additions & 0 deletions kmeans.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@

def kmeans(data, metric, guessmeans, k = 6):
    """Cluster data around k means that are themselves data points.

    This version of kmeans uses ONE OF THE ACTUAL DATA points for each mean:
    the mean of a cluster is the data point that minimizes the sum of the
    distances to the points in its cluster (as computed by calcmeans).

    data -- the points to cluster; len(data) should be at least as large as k.
        Points are used as dict keys, so they must be hashable.
    metric -- callable able to compare two data points, returning a distance.
    guessmeans -- initial means; len(guessmeans) should be EXACTLY k.
    k -- number of means to use.

    Returns a dict mapping each final mean to the list of points assigned to
    it. Iteration stops when calcmeans returns the means it was given, or when
    a set of means comes up a second time (the assignment is cycling).
    """
    means = list(guessmeans)
    # Sets of means already tried; used to detect a cycle so the loop cannot
    # run forever.
    seen = [frozenset(means[:k])]

    while True:
        # initialize the means
        clusters = {}
        for mean in means[:k]:
            clusters[mean] = []

        # Assign every point to its closest mean; on a tie the mean that
        # comes first in the dict wins.
        for point in data:
            print(point)
            closestmean = min(clusters, key=lambda mean, point=point: metric(mean, point))
            clusters[closestmean].append(point)

        newmeans = calcmeans(clusters,metric)
        if equalmeans(newmeans,means):
            return clusters

        signature = frozenset(newmeans)
        if signature in seen:
            return clusters
        seen.append(signature)
        means = newmeans


def calcmeans(clusters,metric):
    """Return a new mean for every cluster.

    clusters -- dict mapping the current mean to the list of points in its
        cluster.
    metric -- callable returning the distance between two points.

    The new mean of a cluster is the member whose summed distance to the
    other members is smallest (the first such member on a tie). Each
    unordered pair is measured once and the distance is credited to both
    points. A cluster with no members keeps its current mean.
    """
    newmeans = []
    for oldmean in clusters:
        points = clusters[oldmean]
        if not points:
            # Nothing to choose from; min() below would fail on no points.
            newmeans.append(oldmean)
            continue

        #init sumdistances to emptiness
        sumdistances = {}
        for point in points:
            sumdistances[point] = 0

        for i in range(len(points)):
            # Start at i + 1 so each pair is visited once and a point is
            # never compared with itself.
            for j in range(i + 1, len(points)):
                print("%s %s" % (i, j))
                distance = metric(points[i], points[j])
                sumdistances[points[i]] += distance
                sumdistances[points[j]] += distance

        newmeans.append(min(points, key = lambda point: sumdistances[point]))

    return newmeans


def equalmeans(newmeans, oldmeans):
    """Return True when both lists have equal length and every new mean
    also occurs in oldmeans; order is not considered."""
    if len(newmeans) != len(oldmeans):
        return False
    return all(mean in oldmeans for mean in newmeans)

def quickguess(data, k = 6):
    """Return the first k items of data, in order, as the initial means."""
    # Index one item at a time (rather than slicing) so that data with fewer
    # than k items still raises IndexError.
    return [data[index] for index in range(k)]

0 comments on commit 86ca205

Please sign in to comment.