From 86ca2053e869a870a65a622a1a94527c584795f8 Mon Sep 17 00:00:00 2001
From: Scott <gitpushoriginmaster@gmail.com>
Date: Wed, 9 Oct 2013 11:25:00 -0400
Subject: [PATCH] First commit!

---
 LICENSE        |  4 +++
 NCD.py         | 30 ++++++++++++++++
 README         |  1 +
 classify.py    | 57 +++++++++++++++++++++++++++++
 cluster.py     | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++
 homeworkmap.py | 25 +++++++++++++
 kmeans.py      | 69 +++++++++++++++++++++++++++++++++++
 7 files changed, 284 insertions(+)
 create mode 100644 LICENSE
 create mode 100644 NCD.py
 create mode 100644 README
 create mode 100644 classify.py
 create mode 100644 cluster.py
 create mode 100644 homeworkmap.py
 create mode 100644 kmeans.py

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..130d297
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,4 @@
+   I hereby release this work into the public domain, to the extent allowed by law, per the CC0 1.0 license 
+   With Love,
+   -Scott!
+   gitpushoriginmaster@gmail.com
diff --git a/NCD.py b/NCD.py
new file mode 100644
index 0000000..4623322
--- /dev/null
+++ b/NCD.py
@@ -0,0 +1,30 @@
+import sys
+import os
+import zlib
+
+COMPRESSIONLEVEL = 6
+
+def NCDkernel(doc1,doc2,doc1_compressed_size = -1, doc2_compressed_size = -1):
+  """Return the normalized compression distance (NCD) between two documents.
+
+  NCD = (C(xy) - min(C(x), C(y))) / max(C(x), C(y)), where C(.) is the
+  zlib-compressed length in bytes at level COMPRESSIONLEVEL.
+
+  doc1, doc2 -- document contents, as strings zlib.compress accepts.
+  doc1_compressed_size, doc2_compressed_size -- optional precomputed
+    compressed lengths; -1 (the default) means "compute it here".
+
+  Lengths come from len(), not sys.getsizeof(): getsizeof also counts the
+  interpreter's per-object overhead, which skews the ratio.
+  """
+  if doc1_compressed_size == -1:
+    doc1_compressed_size = len(zlib.compress(doc1, COMPRESSIONLEVEL))
+
+  if doc2_compressed_size == -1:
+    doc2_compressed_size = len(zlib.compress(doc2, COMPRESSIONLEVEL))
+
+  # "+" concatenation works for both str (Python 2) and bytes documents.
+  bothdocs_compressed_size = len(zlib.compress(doc1 + doc2, COMPRESSIONLEVEL))
+  smaller = min(doc1_compressed_size, doc2_compressed_size)
+  larger = max(doc1_compressed_size, doc2_compressed_size)
+  return 1.0*(bothdocs_compressed_size - smaller)/larger
diff --git a/README b/README
new file mode 100644
index 0000000..997bb6c
--- /dev/null
+++ b/README
@@ -0,0 +1 @@
+This program performs unsupervised clustering of files using the normalized compression distance (NCD).  It takes a set of files and clusters them with a variant of k-means.
diff --git a/classify.py b/classify.py
new file mode 100644
index 0000000..df0e7c2
--- /dev/null
+++ b/classify.py
@@ -0,0 +1,57 @@
+#from svm import *
+import sys
+import os
+import zlib
+import NCD
+from homeworkmap import buildMap
+NCD.COMPRESSIONLEVEL = 6
+
+class StudentPair:
+	# Plain record: two student ids and the distance between their homework.
+	def __init__(self, studentx, studenty, distance):
+		self.studentx, self.studenty = studentx, studenty
+		self.distance = distance
+
+def compareStudents(homeworkpath):
+	"""Return the 5 most similar pairs of student submissions.
+
+	homeworkpath -- homework directory, laid out as buildMap() expects.
+
+	Every pair of submissions is scored with NCD.NCDkernel (smaller means more
+	alike).  The result is a list of exactly 5 StudentPair objects sorted by
+	increasing distance; a slot that no real pair claimed keeps the
+	placeholder StudentPair(None, None, 1).
+	"""
+	studenthomeworkmap = buildMap(homeworkpath)
+	students = list(studenthomeworkmap.keys())
+
+	# Read each submission once instead of once per comparison.
+	homework = {}
+	for student in students:
+		with open(studenthomeworkmap[student], 'r') as homeworkfile:
+			homework[student] = homeworkfile.read()
+
+	biggest5 = [StudentPair(None, None, 1) for _ in range(5)]
+	for i, studentx in enumerate(students):
+		for studenty in students[i+1:]:
+			xydistance = NCD.NCDkernel(homework[studentx], homework[studenty])
+			# Evict the current worst entry, not merely the first larger one;
+			# otherwise a good pair can be overwritten while worse ones stay.
+			worst = max(range(len(biggest5)), key=lambda k: biggest5[k].distance)
+			if xydistance < biggest5[worst].distance:
+				biggest5[worst] = StudentPair(studentx, studenty, xydistance)
+
+	return sorted(biggest5, key=lambda pair: pair.distance)
+		
+
+if __name__ == "__main__":
+	# Guarded so that importing this module does not start a comparison run.
+	for pair in compareStudents(sys.argv[1]):
+		print(pair.studentx)
+		print(pair.studenty)
+		print(pair.distance)
+		print('-------------')
+
+
+      
+    
diff --git a/cluster.py b/cluster.py
new file mode 100644
index 0000000..b22784e
--- /dev/null
+++ b/cluster.py
@@ -0,0 +1,98 @@
+#from svm import *
+import sys
+import os
+import zlib
+import NCD
+import kmeans
+from homeworkmap import buildMap
+import pickle
+NCD.COMPRESSIONLEVEL = 6
+
+class BasicClusters:
+  # Base class: builds the student -> submission map, then stores whatever
+  # the subclass-provided _cluster() returns in self.clusters.
+
+  def __init__(self, homeworkpath):
+    self.studenthomeworkmap = buildMap(homeworkpath)
+    self.clusters = self._cluster()
+
+  def __str__(self):
+    divider = "\n-----------------------------------\n"
+    return "".join("%s: %s%s" % (str(key), str(self.clusters[key]), divider) for key in self.clusters)
+
+
+class SimpleClusters(BasicClusters):
+  """Threshold clustering: students closer than clustergap share a cluster.
+
+  Students are processed one at a time.  A student joins every existing
+  cluster that has at least one member within clustergap (NCD distance);
+  all clusters joined this way are merged into one.  A student with no such
+  neighbour starts a new single-member cluster.
+  """
+
+  def __init__(self, homeworkpath, clustergap = 0.25):
+    # Set before the base constructor runs, because it calls _cluster().
+    self.clustergap = clustergap
+    BasicClusters.__init__(self,homeworkpath)
+
+  def _read(self, student, cache):
+    # Return the student's submission text, reading each file only once.
+    if student not in cache:
+      with open(self.studenthomeworkmap[student], 'r') as homeworkfile:
+        cache[student] = homeworkfile.read()
+    return cache[student]
+
+  def _cluster(self):
+    """Return a dict mapping a cluster key (a student id) to its member list.
+
+    The key is the most recently added member of that cluster.
+    """
+    cache = {}
+    clusters = {}
+
+    for student in self.studenthomeworkmap.keys():
+      studentxhomework = self._read(student, cache)
+      foundclusters = []
+      for cluster in clusters:
+        for member in clusters[cluster]:
+          print("%s %s" % (student, member))
+          if NCD.NCDkernel(studentxhomework, self._read(member, cache)) < self.clustergap:
+            foundclusters.append(cluster)
+            break  # one close member is enough to claim this cluster
+
+      # Merge every matching cluster, then file the result under this student.
+      newcluster = []
+      for cluster in foundclusters:
+        newcluster.extend(clusters.pop(cluster))
+      newcluster.append(student)
+      clusters[student] = newcluster
+    return clusters
+
+
+class KMeansClusters(SimpleClusters):
+  """Cluster submissions into k groups with kmeans.kmeans under NCD."""
+
+  def __init__(self,homeworkpath,k = 6):
+    # k must exist before the base constructor calls _cluster().
+    self.k = k
+    BasicClusters.__init__(self,homeworkpath)
+
+  def _cluster(self):
+    data = list(self.studenthomeworkmap.keys())
+    return kmeans.kmeans(data, self._metric, kmeans.quickguess(data,self.k), self.k)
+
+  def _metric(self,x,y):
+    # Distance between students x and y: the NCD of their submission files.
+    # "with" closes both files even if a read raises.
+    with open(self.studenthomeworkmap[x],'r') as xfile, open(self.studenthomeworkmap[y],'r') as yfile:
+      xdata, ydata = xfile.read(), yfile.read()
+    return NCD.NCDkernel(xdata,ydata)
+    
+
+if __name__ == "__main__":
+  # Usage: cluster.py <homeworkpath> <k>
+  clustered = KMeansClusters(sys.argv[1],int(sys.argv[2]))
+  print(clustered)
+  # Close the pickle file deterministically so the dump is flushed to disk.
+  with open("lastcluster.p","wb") as picklefile:
+    pickle.dump(clustered,picklefile)
diff --git a/homeworkmap.py b/homeworkmap.py
new file mode 100644
index 0000000..3a74358
--- /dev/null
+++ b/homeworkmap.py
@@ -0,0 +1,25 @@
+import os
+
+def buildMap(homeworkpath):
+	"""Map each student id to the path of one .py file in their latest submission.
+
+	Layout: homeworkpath/<student>/<submission>/<files>.  The submission folder
+	whose name sorts highest counts as the latest (presumably the names are
+	timestamps -- TODO confirm); students without a .py file there are omitted.
+	"""
+	studenthomeworkmap = {}
+	for student in os.listdir(homeworkpath):
+		submissionspath = os.path.join(homeworkpath,student)
+		if not os.path.isdir(submissionspath):
+			continue
+		# Only folders can be submissions; a stray file must not win max().
+		submissions = [name for name in os.listdir(submissionspath) if os.path.isdir(os.path.join(submissionspath, name))]
+		if not submissions:
+			continue
+		latestpath = os.path.join(submissionspath,max(submissions))
+		# Sorted so the pick is deterministic when a folder holds several .py files.
+		for filename in sorted(os.listdir(latestpath)):
+			if os.path.splitext(filename)[1] == '.py':
+				studenthomeworkmap[student] = os.path.join(latestpath, filename)
+				break
+	return studenthomeworkmap
diff --git a/kmeans.py b/kmeans.py
new file mode 100644
index 0000000..4804182
--- /dev/null
+++ b/kmeans.py
@@ -0,0 +1,69 @@
+
+#metric should be able to compare two data points
+#len(data) should be at least as large as k
+#len(guessmeans) should be EXACTLY k
+#this version of kmeans uses ONE OF THE ACTUAL DATA points for each mean.  The mean is the data point that minimizes the sum of the distances to each of the points in its cluster
+def kmeans(data, metric, guessmeans, k = 6):
+  """Cluster data around k means; return a dict mapping mean -> member list.
+
+  data -- sequence of hashable points.
+  metric -- callable giving the distance between two points.
+  guessmeans -- starting means; the first k of them are used.
+
+  Iterates in a loop instead of recursing, so a long run cannot exhaust the
+  recursion limit, and stops when the means settle or start to repeat.
+  """
+  means = list(guessmeans[:k])
+  seen = set()
+  while True:
+    seen.add(frozenset(means))
+    clusters = dict((mean, []) for mean in means)
+    for point in data:
+      print(point)
+      # min() keeps the first of equally close means, like a strict "<" scan.
+      clusters[min(clusters, key=lambda mean: metric(mean, point))].append(point)
+
+    newmeans = calcmeans(clusters,metric)
+    if equalmeans(newmeans,means) or frozenset(newmeans) in seen:
+      return clusters
+    means = newmeans
+
+
+def calcmeans(clusters,metric):
+  """Return one new mean per cluster: the member with the smallest distance sum.
+
+  clusters -- dict mapping a mean to the list of points assigned to it.
+  metric -- callable giving the distance between two points.
+  An empty cluster keeps its current mean, since there is nothing to pick.
+  """
+  newmeans = []
+  for cluster in clusters:
+    members = clusters[cluster]
+    sumdistances = dict((point, 0) for point in members)
+    # Each unordered pair once (j > i): no self-distances, no double counting.
+    for i in range(len(members)):
+      for j in range(i + 1, len(members)):
+        print("%d %d" % (i, j))
+        distance = metric(members[i],members[j])
+        sumdistances[members[i]] += distance
+        sumdistances[members[j]] += distance
+    newmeans.append(min(sumdistances.items(),key = lambda x: x[1])[0] if members else cluster)
+  return newmeans
+  
+
+def equalmeans(newmeans, oldmeans):
+  """Tell whether newmeans matches oldmeans, ignoring order.
+
+  True when the lists have equal length and every element of newmeans also
+  occurs in oldmeans.
+  """
+  if len(newmeans) != len(oldmeans):
+    return False
+  return all(mean in oldmeans for mean in newmeans)
+
+def quickguess(data, k = 6):
+  # Initial guess for kmeans: simply the first k points of data, in order.
+  # Indexing (rather than slicing) keeps the IndexError when len(data) < k.
+  means = [data[index] for index in range(k)]
+  return means
+