-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 86ca205
Showing
7 changed files
with
284 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
I hereby release this work into the public domain, to the extent allowed by law, per the CC0 1.0 license | ||
With Love, | ||
-Scott! | ||
gitpushoriginmaster@gmail.com |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
import sys | ||
import os | ||
import zlib | ||
|
||
# zlib compression level (0-9) used for every size measurement in this module.
COMPRESSIONLEVEL = 6


def _as_bytes(doc):
    """Return ``doc`` as a byte string, UTF-8 encoding it if it is text."""
    if isinstance(doc, bytes):
        return doc
    return doc.encode('utf-8')


def NCDkernel(doc1, doc2, doc1_compressed_size=-1, doc2_compressed_size=-1):
    """Return the normalized compression distance between two documents.

    The value is ``(C(doc1 + doc2) - min(C(doc1), C(doc2))) /
    max(C(doc1), C(doc2))`` where ``C(x)`` is the length in bytes of ``x``
    after zlib compression at ``COMPRESSIONLEVEL``.

    Args:
        doc1: First document, as a byte string or text (text is encoded
            as UTF-8 before compression).
        doc2: Second document, same accepted types as ``doc1``.
        doc1_compressed_size: Precomputed ``C(doc1)``; ``-1`` (the default)
            means "compute it here".
        doc2_compressed_size: Precomputed ``C(doc2)``; ``-1`` (the default)
            means "compute it here".

    Returns:
        The distance as a float.
    """
    doc1 = _as_bytes(doc1)
    doc2 = _as_bytes(doc2)

    # Sizes are taken with len() on the compressed output. sys.getsizeof()
    # would add the interpreter's per-object overhead to every measurement,
    # which inflates the denominator and skews the distance for short inputs.
    if doc1_compressed_size == -1:
        doc1_compressed_size = len(zlib.compress(doc1, COMPRESSIONLEVEL))

    if doc2_compressed_size == -1:
        doc2_compressed_size = len(zlib.compress(doc2, COMPRESSIONLEVEL))

    bothdocs_compressed_size = len(zlib.compress(doc1 + doc2, COMPRESSIONLEVEL))

    smaller = min(doc1_compressed_size, doc2_compressed_size)
    larger = max(doc1_compressed_size, doc2_compressed_size)

    # 1.0 * forces float division on Python 2 as well.
    return 1.0 * (bothdocs_compressed_size - smaller) / larger
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
This program performs unsupervised clustering of files using the normalized compression distance (NCD). It takes a set of files and clusters them with a variant of k-means. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
#from svm import * | ||
import sys | ||
import os | ||
import zlib | ||
import NCD | ||
from homeworkmap import buildMap | ||
NCD.COMPRESSIONLEVEL = 6 | ||
|
||
class StudentPair:
    """Plain record holding two student identifiers and a distance value."""

    def __init__(self, studentx, studenty, distance):
        # Store the three constructor arguments unchanged on the instance.
        (self.studentx,
         self.studenty,
         self.distance) = (studentx, studenty, distance)
|
||
def compareStudents(homeworkpath):
    """Return the five closest pairs of student submissions.

    Every pair of students returned by ``buildMap(homeworkpath)`` is compared
    with ``NCD.NCDkernel``. Only pairs whose distance is below 1 are kept.

    Args:
        homeworkpath: Directory handed to ``buildMap``.

    Returns:
        A list of exactly five ``StudentPair`` objects ordered by increasing
        distance. When fewer than five pairs qualify, the list is padded with
        ``StudentPair(None, None, 1)`` placeholders.
    """
    studenthomeworkmap = buildMap(homeworkpath)
    students = list(studenthomeworkmap.keys())

    # Read each submission once up front instead of once per comparison.
    homework = {}
    for student in students:
        with open(studenthomeworkmap[student], 'r') as homeworkfile:
            homework[student] = homeworkfile.read()

    closest = []
    for i in range(len(students) - 1):
        for j in range(i + 1, len(students)):
            xydistance = NCD.NCDkernel(homework[students[i]],
                                       homework[students[j]])

            # Distances of 1 or more never beat the placeholder entries.
            if xydistance < 1:
                # Insert, re-sort and trim so that a new pair pushes out the
                # current worst entry rather than overwriting a better one.
                closest.append(StudentPair(students[i], students[j], xydistance))
                closest.sort(key=lambda pair: pair.distance)
                del closest[5:]

    # Callers always receive five entries.
    while len(closest) < 5:
        closest.append(StudentPair(None, None, 1))

    return closest
|
||
|
||
# Script driver: the homework directory is the first command-line argument.
biggest5 = compareStudents(sys.argv[1])

# One value per line for each pair, followed by a separator line.
for pair in biggest5:
    for field in (pair.studentx, pair.studenty, pair.distance, '-------------'):
        print(field)
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
#from svm import * | ||
import sys | ||
import os | ||
import zlib | ||
import NCD | ||
import kmeans | ||
from homeworkmap import buildMap | ||
import pickle | ||
NCD.COMPRESSIONLEVEL = 6 | ||
|
||
class BasicClusters:
    """Base class holding a student-to-homework map and its clustering.

    This class does not define ``_cluster`` itself; the subclasses in this
    file provide it, and its return value is stored in ``self.clusters``.
    """

    def __init__(self, homeworkpath):
        # The map is built first because _cluster() is invoked right after.
        self.studenthomeworkmap = buildMap(homeworkpath)
        self.clusters = self._cluster()

    def __str__(self):
        # One "key: value" entry per cluster, each followed by a rule line.
        separator = "\n-----------------------------------\n"
        pieces = [
            str(key) + ": " + str(self.clusters[key]) + separator
            for key in self.clusters
        ]
        return "".join(pieces)
|
||
|
||
class SimpleClusters(BasicClusters):
    """Threshold clustering of student submissions.

    A student joins every existing cluster that contains at least one member
    whose ``NCD.NCDkernel`` distance to the student is below ``clustergap``;
    all clusters joined this way are merged into one.
    """

    def __init__(self, homeworkpath, clustergap=0.25):
        # clustergap must be set before the base initializer, which calls
        # _cluster().
        self.clustergap = clustergap
        BasicClusters.__init__(self, homeworkpath)

    def _cluster(self):
        """Return a dict mapping a student key to the list of its cluster.

        Each cluster is keyed by the student that was added to it last.
        """
        clusters = {}
        # Cache of file contents so each submission is read from disk once.
        contents = {}

        def read_homework(student):
            if student not in contents:
                with open(self.studenthomeworkmap[student], 'r') as homeworkfile:
                    contents[student] = homeworkfile.read()
            return contents[student]

        for student in self.studenthomeworkmap:
            studentxhomework = read_homework(student)
            foundclusters = []

            for cluster in clusters:
                for member in clusters[cluster]:
                    # Progress trace: the pair about to be compared.
                    print("%s %s" % (student, member))
                    xydistance = NCD.NCDkernel(studentxhomework,
                                               read_homework(member))
                    if xydistance < self.clustergap:
                        # One close member is enough to join this cluster.
                        foundclusters.append(cluster)
                        break

            if not foundclusters:
                clusters[student] = [student]
            else:
                # Merge every matching cluster and re-key it by this student.
                newcluster = []
                for cluster in foundclusters:
                    newcluster.extend(clusters[cluster])
                    del clusters[cluster]
                newcluster.append(student)
                clusters[student] = newcluster

        return clusters
|
||
|
||
class KMeansClusters(SimpleClusters):
    """Clustering of student submissions via ``kmeans.kmeans``."""

    def __init__(self, homeworkpath, k=6):
        self.k = k
        # Calls the BasicClusters initializer directly, so the clustergap
        # attribute of SimpleClusters is never set on these instances.
        BasicClusters.__init__(self, homeworkpath)

    def _cluster(self):
        """Return the result of ``kmeans.kmeans`` over all student keys."""
        # list() guarantees an indexable sequence; quickguess indexes into it.
        data = list(self.studenthomeworkmap.keys())
        return kmeans.kmeans(data, self._metric,
                             kmeans.quickguess(data, self.k), self.k)

    def _metric(self, x, y):
        """Return the NCD between the homework files of students x and y."""
        with open(self.studenthomeworkmap[x], 'r') as xfile:
            xdata = xfile.read()
        with open(self.studenthomeworkmap[y], 'r') as yfile:
            ydata = yfile.read()
        return NCD.NCDkernel(xdata, ydata)
|
||
|
||
if __name__ == "__main__":
    # Arguments: <homework directory> <number of clusters k>
    clustered = KMeansClusters(sys.argv[1], int(sys.argv[2]))
    print(clustered)
    # Close the pickle file explicitly so the data is flushed to disk.
    with open("lastcluster.p", "wb") as picklefile:
        pickle.dump(clustered, picklefile)
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
import os | ||
|
||
#send this a homework directory! | ||
def buildMap(homeworkpath):
    """Map each student to a ``.py`` file from their latest submission.

    ``homeworkpath`` is expected to contain one directory per student, each
    holding one directory per submission. The submission directory whose
    name compares greatest is treated as the latest one.

    Args:
        homeworkpath: Path of the homework directory.

    Returns:
        A dict from student directory name to the path of the first ``.py``
        file (in sorted name order) inside that student's latest submission
        directory. Students without such a file are left out.
    """
    studenthomeworkmap = {}
    for student in os.listdir(homeworkpath):
        submissionspath = os.path.join(homeworkpath, student)
        if not os.path.isdir(submissionspath):
            continue

        # Keep directories only: a stray file whose name sorts last would
        # otherwise be picked as the latest submission and fail in listdir.
        submissions = [
            name for name in os.listdir(submissionspath)
            if os.path.isdir(os.path.join(submissionspath, name))
        ]
        if not submissions:
            continue

        latestpath = os.path.join(submissionspath, max(submissions))
        # os.listdir order is arbitrary; sorting makes the choice repeatable.
        for filename in sorted(os.listdir(latestpath)):
            if os.path.splitext(filename)[1] == '.py':
                studenthomeworkmap[student] = os.path.join(latestpath, filename)
                break
    return studenthomeworkmap
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
|
||
#metric should be able to compare two data points | ||
#len(data) should be at least as large as k | ||
#len(guessmeans) should be EXACTLY k | ||
#this version of kmeans uses ONE OF THE ACTUAL DATA points for each mean. The mean is the data point that minimizes the sum of the distances to each of the points in its cluster | ||
def kmeans(data, metric, guessmeans, k=6):
    """Cluster ``data`` around means that are themselves data points.

    Each round assigns every point to the mean with the smallest
    ``metric(mean, point)`` (the first such mean wins ties), then asks
    ``calcmeans`` for new means. The loop ends when ``equalmeans`` reports
    that the means did not change, or when a set of means comes up a second
    time.

    Args:
        data: Iterable of hashable data points.
        metric: Callable taking two data points and returning a number.
        guessmeans: Sequence of initial means; only the first ``k`` are used.
        k: Number of clusters.

    Returns:
        A dict mapping each final mean to the list of points assigned to it.
    """
    means = guessmeans
    # Mean sets already tried; seeing one again means the search is cycling.
    seen = set()

    while True:
        seen.add(frozenset(means))

        clusters = {}
        for mean in means[:k]:
            clusters[mean] = []

        for point in data:
            # Progress trace: the point being assigned.
            print(point)
            closestmean = None
            # None marks "no distance seen yet"; the first mean always wins
            # the initial comparison without relying on mixed-type ordering.
            closestdist = None
            for mean in clusters:
                distance = metric(mean, point)
                if closestdist is None or distance < closestdist:
                    closestdist = distance
                    closestmean = mean
            clusters[closestmean].append(point)

        newmeans = calcmeans(clusters, metric)
        if equalmeans(newmeans, means):
            return clusters

        # Stop instead of looping forever when the means start to repeat.
        if frozenset(newmeans) in seen:
            return clusters

        means = newmeans
|
||
|
||
def calcmeans(clusters, metric):
    """Return one new mean per cluster.

    The new mean of a cluster is the member with the smallest sum of
    ``metric`` distances to the other members. A cluster without members
    keeps its current key as its mean.

    Args:
        clusters: Dict mapping a mean to the list of points assigned to it.
        metric: Callable taking two data points and returning a number.

    Returns:
        A list of means, one per entry of ``clusters``, in the iteration
        order of ``clusters``.
    """
    newmeans = []
    for cluster in clusters:
        members = clusters[cluster]
        if not members:
            # Nothing to choose from; keep the existing mean.
            newmeans.append(cluster)
            continue

        sumdistances = {}
        for point in members:
            sumdistances[point] = 0

        # Visit each unordered pair once (j starts after i); the distance
        # counts towards the totals of both members of the pair.
        for i in range(len(members) - 1):
            for j in range(i + 1, len(members)):
                # Progress trace: indices of the pair being measured.
                print("%d %d" % (i, j))
                distance = metric(members[i], members[j])
                sumdistances[members[i]] += distance
                sumdistances[members[j]] += distance

        # min() over the member list makes ties resolve by member order.
        newmeans.append(min(members, key=lambda point: sumdistances[point]))

    return newmeans
|
||
|
||
def equalmeans(newmeans, oldmeans):
    """Return True when both lists have equal length and every new mean
    also occurs in ``oldmeans``; otherwise return False."""
    same_length = len(newmeans) == len(oldmeans)
    return same_length and all(candidate in oldmeans for candidate in newmeans)
|
||
def quickguess(data, k=6):
    """Return a list with the first ``k`` entries of ``data``.

    Entries are fetched by index, so ``data`` must hold at least ``k`` items.
    """
    return [data[position] for position in range(k)]
|