First commit!

csbrown · Oct 9, 2013 · 86ca205 · 86ca205
commit 86ca205
Show file tree

Hide file tree

Showing 7 changed files with 284 additions and 0 deletions.
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,4 @@
+   I hereby release this work into the public domain, to the extent allowed by law, per the CC0 1.0 license 
+   With Love,
+   -Scott!
+   gitpushoriginmaster@gmail.com
diff --git a/NCD.py b/NCD.py
@@ -0,0 +1,30 @@
+import sys
+import os
+import zlib
+
+# zlib compression level handed to every zlib.compress() call in NCDkernel.
+COMPRESSIONLEVEL = 6
+
+def NCDkernel(doc1,doc2,doc1_compressed_size = -1, doc2_compressed_size = -1):
+  """Return the Normalized Compression Distance (NCD) between two documents.
+
+  NCD(x, y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y))
+
+  where C(.) is the length in bytes of the zlib-compressed input, using the
+  module-level COMPRESSIONLEVEL.
+
+  doc1, doc2 -- raw document contents (byte strings accepted by zlib).
+  doc1_compressed_size, doc2_compressed_size -- optional precomputed
+    compressed lengths (len() of the zlib output); the default -1 means
+    "compute it here".
+
+  Returns a float; smaller values mean the documents share more content.
+  """
+  # len() is the real compressed length.  sys.getsizeof() would add the
+  # interpreter's per-object overhead to every term, which inflates the
+  # denominator and skews the distance, most visibly for small documents.
+  if doc1_compressed_size == -1:
+    doc1_compressed_size = len(zlib.compress(doc1, COMPRESSIONLEVEL))
+
+  if doc2_compressed_size == -1:
+    doc2_compressed_size = len(zlib.compress(doc2, COMPRESSIONLEVEL))
+
+  bothdocs_compressed_size = len(zlib.compress(doc1 + doc2, COMPRESSIONLEVEL))
+
+  smaller = min(doc1_compressed_size, doc2_compressed_size)
+  larger = max(doc1_compressed_size, doc2_compressed_size)
+
+  # float() keeps the division exact under Python 2 integer semantics.
+  return float(bothdocs_compressed_size - smaller) / larger
diff --git a/README b/README
@@ -0,0 +1 @@
+This program performs unsupervised clustering of files using the Normalized Compression Distance (NCD). It takes a set of files and clusters them with a variant of k-means.
diff --git a/classify.py b/classify.py
@@ -0,0 +1,57 @@
+#from svm import *
+import sys
+import os
+import zlib
+import NCD
+from homeworkmap import buildMap
+NCD.COMPRESSIONLEVEL = 6
+
+class StudentPair:
+	"""Two student ids together with the distance measured between them."""
+	def __init__(self, studentx, studenty, distance):
+		# Store the three constructor arguments unchanged as attributes.
+		self.studentx, self.studenty, self.distance = studentx, studenty, distance
+
+def compareStudents(homeworkpath):
+	"""Return the five most similar pairs of student submissions.
+
+	homeworkpath -- directory handed to buildMap() to locate one homework
+		file per student.
+
+	Returns a list of exactly five StudentPair objects ordered by increasing
+	NCD.  Only pairs whose distance is strictly below 1 are recorded; unused
+	slots keep the placeholder StudentPair(None, None, 1).
+	"""
+	studenthomeworkmap = buildMap(homeworkpath)
+	students = list(studenthomeworkmap.keys())
+
+	# Read every submission once up front rather than once per pair.
+	homework = {}
+	for student in students:
+		with open(studenthomeworkmap[student], 'r') as homeworkfile:
+			homework[student] = homeworkfile.read()
+
+	# Placeholders with distance 1 act as the upper bound for admission.
+	biggest5 = [StudentPair(None, None, 1) for _ in range(5)]
+
+	for i in range(len(students) - 1):
+		for j in range(i + 1, len(students)):
+			xydistance = NCD.NCDkernel(homework[students[i]], homework[students[j]])
+
+			# Keep the list sorted: find the first entry that is strictly
+			# farther than the new pair, insert there and drop the current
+			# worst.  Overwriting a slot in place (the previous behaviour)
+			# could discard a closer pair while keeping a farther one.
+			k = 0
+			while k < len(biggest5) and biggest5[k].distance <= xydistance:
+				k += 1
+			if k < len(biggest5):
+				biggest5.insert(k, StudentPair(students[i], students[j], xydistance))
+				biggest5.pop()
+
+	return biggest5
+
+
+# Script body: compare the homework directory named by the first command-line
+# argument and print each returned pair followed by a separator line.
+biggest5 = compareStudents(sys.argv[1])
+
+for closepair in biggest5:
+	for field in (closepair.studentx, closepair.studenty, closepair.distance, '-------------'):
+		print field
+
+
+
+
diff --git a/cluster.py b/cluster.py
@@ -0,0 +1,98 @@
+#from svm import *
+import sys
+import os
+import zlib
+import NCD
+import kmeans
+from homeworkmap import buildMap
+import pickle
+NCD.COMPRESSIONLEVEL = 6
+
+class BasicClusters:
+  """Common base: build the student -> file map, then call self._cluster().
+
+  _cluster() is not defined here; subclasses supply it.  Its result is stored
+  in self.clusters and is rendered by __str__ as a mapping of key -> value.
+  """
+
+  def __init__(self, homeworkpath):
+    self.studenthomeworkmap = buildMap(homeworkpath)
+    self.clusters = self._cluster()
+
+  def __str__(self):
+    # One "key: value" entry per cluster, each followed by a divider line.
+    divider = "\n-----------------------------------\n"
+    return "".join(
+      str(key) + ": " + str(self.clusters[key]) + divider
+      for key in self.clusters
+    )
+
+
+class SimpleClusters(BasicClusters):
+  """Threshold clustering: a student joins every existing cluster that has at
+  least one member closer than clustergap, and all such clusters are merged.
+  """
+
+  def __init__(self, homeworkpath, clustergap = 0.25):
+    # clustergap must be set before the base constructor runs _cluster().
+    self.clustergap = clustergap
+    BasicClusters.__init__(self,homeworkpath)
+
+  def _cluster(self):
+    """Return a dict mapping a representative student to its member list."""
+    groups = {}
+
+    for student in self.studenthomeworkmap.keys():
+      with open(self.studenthomeworkmap[student], 'r') as ownfile:
+        ownwork = ownfile.read()
+
+      # Labels of every existing group containing a close-enough member.
+      matched = []
+      for label in groups:
+        for member in groups[label]:
+          print student, member
+          with open(self.studenthomeworkmap[member], 'r') as otherfile:
+            otherwork = otherfile.read()
+
+          if NCD.NCDkernel(ownwork, otherwork) < self.clustergap:
+            matched.append(label)
+            # One close member is enough; skip the rest of this group.
+            break
+
+      # Fold all matched groups (possibly none) into one group keyed by the
+      # current student, who is appended last.
+      merged = []
+      for label in matched:
+        merged.extend(groups.pop(label))
+      merged.append(student)
+      groups[student] = merged
+
+    return groups
+
+
+class KMeansClusters(SimpleClusters):
+  """Clusters the students with kmeans.kmeans, using NCD as the metric."""
+
+  def __init__(self,homeworkpath,k = 6):
+    # k must be set before the base constructor runs _cluster().
+    self.k = k
+    BasicClusters.__init__(self,homeworkpath)
+
+  def _cluster(self):
+    """Run k-means over the student ids, seeded by kmeans.quickguess."""
+    points = self.studenthomeworkmap.keys()
+    seeds = kmeans.quickguess(points,self.k)
+    return kmeans.kmeans(points, self._metric, seeds, self.k)
+
+  def _metric(self,x,y):
+    """Return the NCD between the homework files of students x and y."""
+    contents = []
+    for student in (x, y):
+      with open(self.studenthomeworkmap[student],'r') as source:
+        contents.append(source.read())
+    return NCD.NCDkernel(contents[0],contents[1])
+
+
+if __name__ == "__main__":
+  # Usage: cluster.py <homework directory> <k>
+  clustered = KMeansClusters(sys.argv[1],int(sys.argv[2]))
+  print clustered
+  # Use a context manager so the pickle file is flushed and closed even if
+  # dumping fails, instead of relying on garbage collection of the handle.
+  with open("lastcluster.p","wb") as picklefile:
+    pickle.dump(clustered,picklefile)
+
+
+
diff --git a/homeworkmap.py b/homeworkmap.py
@@ -0,0 +1,25 @@
+import os
+
+#send this a homework directory!
+def buildMap(homeworkpath):
+	"""Map each student directory name to one .py file of its latest submission.
+
+	homeworkpath -- directory whose entries are per-student directories; each
+		of those contains one sub-directory per submission.
+
+	The "latest" submission is the entry with the greatest name (max() on the
+	names).  The first file ending in '.py' found in it, in os.listdir order,
+	is used.  Students with no submissions or no .py file are left out, and
+	entries of homeworkpath that are not directories are ignored.
+	"""
+	mapping = {}
+	for student in os.listdir(homeworkpath):
+		studentdir = os.path.join(homeworkpath, student)
+		if not os.path.isdir(studentdir):
+			continue
+
+		entries = os.listdir(studentdir)
+		if not entries:
+			continue
+
+		newestdir = os.path.join(studentdir, max(entries))
+		for filename in os.listdir(newestdir):
+			if os.path.splitext(filename)[1] == '.py':
+				mapping[student] = os.path.join(newestdir, filename)
+				break
+	return mapping
diff --git a/kmeans.py b/kmeans.py
@@ -0,0 +1,69 @@
+
+#metric should be able to compare two data points
+#len(data) should be at least as large as k
+#len(guessmeans) should be EXACTLY k
+#this version of kmeans uses ONE OF THE ACTUAL DATA points for each mean.  The mean is the data point that minimizes the sum of the distances to each of the points in its cluster
+def kmeans(data, metric, guessmeans, k = 6):
+  """Cluster data around k means, each of which is an actual data point.
+
+  data -- re-iterable collection of hashable points.
+  metric -- callable metric(a, b) returning a comparable distance.
+  guessmeans -- initial means; only the first k entries are used.
+  k -- number of clusters.
+
+  Returns a dict mapping each final mean to the list of points assigned to
+  it.  Iteration stops when calcmeans() reproduces the current means, or when
+  a set of means seen in an earlier round comes back (an oscillation), in
+  which case the current assignment is returned.
+  """
+  means = list(guessmeans[:k])
+  # Every set of means tried so far; lets us detect cycles instead of
+  # recursing until the interpreter's recursion limit is hit.
+  seen = set()
+
+  while True:
+    seen.add(frozenset(means))
+
+    #initialize the means
+    clusters = {}
+    for mean in means:
+      clusters[mean] = []
+
+    for point in data:
+      print point
+      closestmean = None
+      # None marks "no distance measured yet"; this avoids relying on
+      # Python 2's ordering between strings and numbers.
+      closestdist = None
+      for mean in clusters:
+        distance = metric(mean, point)
+        if closestdist is None or distance < closestdist:
+          closestdist = distance
+          closestmean = mean
+      clusters[closestmean].append(point)
+
+    newmeans = calcmeans(clusters,metric)
+    if equalmeans(newmeans,means):
+      return clusters
+    if frozenset(newmeans) in seen:
+      # The means are cycling; another round cannot converge.
+      return clusters
+
+    means = list(newmeans[:k])
+
+
+def calcmeans(clusters,metric):
+  """Pick a new mean for every cluster.
+
+  clusters -- dict mapping the current mean to the list of its points.
+  metric -- callable metric(a, b) returning a numeric distance.
+
+  For each cluster the new mean is the member whose summed distance to the
+  other members is smallest (ties go to the earliest member).  A cluster with
+  no members keeps its current mean.  Returns the list of new means, one per
+  cluster, in the iteration order of clusters.
+  """
+  newmeans = []
+  for cluster in clusters:
+    members = clusters[cluster]
+    if not members:
+      # min() over nothing would raise; keep the old mean so the number of
+      # means stays the same.
+      newmeans.append(cluster)
+      continue
+
+    #init sumdistances to emptiness
+    sumdistances = {}
+    for point in members:
+      sumdistances[point] = 0
+
+    # Visit each unordered pair exactly once (j starts after i).  The earlier
+    # inner loop restarted j at 0, so it measured every pair twice plus each
+    # point against itself.
+    for i in range(len(members)):
+      for j in range(i + 1, len(members)):
+        print i,j
+        distance = metric(members[i],members[j])
+        sumdistances[members[i]] += distance
+        sumdistances[members[j]] += distance
+
+    newmeans.append(min(members, key = lambda point: sumdistances[point]))
+
+  return newmeans
+
+
+def equalmeans(newmeans, oldmeans):
+  """Return True when both lists have the same length and every entry of
+  newmeans also occurs in oldmeans; otherwise False.
+  """
+  if len(newmeans) != len(oldmeans):
+    return False
+  return all(candidate in oldmeans for candidate in newmeans)
+
+def quickguess(data, k = 6):
+  """Return the first k items of data, by index, as the initial means.
+
+  Raises IndexError when data holds fewer than k items.
+  """
+  return [data[position] for position in range(k)]
+
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		This program performs unsupervised clustering of files using the Normalized Compression Distance (NCD). It takes a set of files and clusters them with a variant of k-means.