Skip to content

Commit

Permalink
Small updates based on code review. Renamed statistical_summary.py to correlations.py.
Browse files Browse the repository at this point in the history
  • Loading branch information
jkbradley committed Aug 13, 2014
1 parent ab48f6e commit 0b7cec3
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 29 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#

"""
Statistical summarization using MLlib.
Correlations using MLlib.
"""

import sys
Expand All @@ -29,9 +29,9 @@

if __name__ == "__main__":
if len(sys.argv) not in [1,2]:
print >> sys.stderr, "Usage: statistical_summary (<file>)"
print >> sys.stderr, "Usage: correlations (<file>)"
exit(-1)
sc = SparkContext(appName="PythonStatisticalSummary")
sc = SparkContext(appName="PythonCorrelations")
if len(sys.argv) == 2:
filepath = sys.argv[1]
else:
Expand Down
18 changes: 9 additions & 9 deletions examples/src/main/python/mllib/random_and_sampled_rdds.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

if __name__ == "__main__":
if len(sys.argv) not in [1, 2]:
print >> sys.stderr, "Usage: logistic_regression <libsvm data file>"
print >> sys.stderr, "Usage: random_and_sampled_rdds <libsvm data file>"
exit(-1)
if len(sys.argv) == 2:
datapath = sys.argv[1]
Expand All @@ -45,22 +45,23 @@

# Example: RandomRDDGenerators
normalRDD = RandomRDDGenerators.normalRDD(sc, numExamples)
print 'Generated RDD of %d examples sampled from a unit normal distribution' % normalRDD.count()
print 'Generated RDD of %d examples sampled from the standard normal distribution'\
% normalRDD.count()
normalVectorRDD = RandomRDDGenerators.normalVectorRDD(sc, numRows = numExamples, numCols = 2)
print 'Generated RDD of %d examples of length-2 vectors.' % normalVectorRDD.count()

print ''
print

# Example: RDD.sample() and RDD.takeSample()
exactSampleSize = int(numExamples * fraction)
expectedSampleSize = int(numExamples * fraction)
print 'Sampling RDD using fraction %g. Expected sample size = %d.' \
% (fraction, exactSampleSize)
% (fraction, expectedSampleSize)
sampledRDD = normalRDD.sample(withReplacement = True, fraction = fraction)
print ' RDD.sample(): sample has %d examples' % sampledRDD.count()
sampledArray = normalRDD.takeSample(withReplacement = True, num = exactSampleSize)
sampledArray = normalRDD.takeSample(withReplacement = True, num = expectedSampleSize)
print ' RDD.takeSample(): sample has %d examples' % len(sampledArray)

print ''
print

# Example: RDD.sampleByKey()
examples = MLUtils.loadLibSVMFile(sc, datapath)
Expand All @@ -74,8 +75,7 @@
fractions = {}
for k in keyCountsA.keys():
fractions[k] = fraction
sampledByKeyRDD = \
keyedRDD.sampleByKey(withReplacement = True, fractions = fractions)#, exact = True)
sampledByKeyRDD = keyedRDD.sampleByKey(withReplacement = True, fractions = fractions)
keyCountsB = sampledByKeyRDD.countByKey()
sizeB = sum(keyCountsB.values())
print ' Sampled %d examples using approximate stratified sampling (by label). ==> Sample' \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,30 +32,33 @@ import org.apache.spark.SparkContext._
* }}}
* If you use it as a template to create your own app, please use `spark-submit` to submit your app.
*/
object RandomAndSampledRDDs extends App {
object RandomAndSampledRDDs {

case class Params(input: String = "data/mllib/sample_binary_classification_data.txt")

val defaultParams = Params()
def main(args: Array[String]) {
val defaultParams = Params()

val parser = new OptionParser[Params]("RandomAndSampledRDDs") {
head("RandomAndSampledRDDs: an example app for randomly generated and sampled RDDs.")
opt[String]("input")
.text(s"Input path to labeled examples in LIBSVM format, default: ${defaultParams.input}")
.action((x, c) => c.copy(input = x))
note(
"""
val parser = new OptionParser[Params]("RandomAndSampledRDDs") {
head("RandomAndSampledRDDs: an example app for randomly generated and sampled RDDs.")
opt[String]("input")
.text(s"Input path to labeled examples in LIBSVM format, default: ${defaultParams.input}")
.action((x, c) => c.copy(input = x))
note(
"""
|For example, the following command runs this app:
|
| bin/spark-submit --class org.apache.spark.examples.mllib.RandomAndSampledRDDs \
| examples/target/scala-*/spark-examples-*.jar
""".stripMargin)
}
""".
stripMargin)
}

parser.parse(args, defaultParams).map { params =>
run(params)
} getOrElse {
sys.exit(1)
parser.parse(args, defaultParams).map { params =>
run(params)
} getOrElse {
sys.exit(1)
}
}

def run(params: Params) {
Expand Down
4 changes: 2 additions & 2 deletions python/pyspark/mllib/linalg.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,12 +161,12 @@ def squared_distance(self, other):
j += 1
return result

def toDense(self):
def toArray(self):
"""
Returns a copy of this SparseVector as a 1-dimensional NumPy array.
"""
arr = numpy.zeros(self.size)
for i in range(self.indices.size):
for i in xrange(self.indices.size):
arr[self.indices[i]] = self.values[i]
return arr

Expand Down

0 comments on commit 0b7cec3

Please sign in to comment.