Merge branch 'sort' into groupby
davies committed Aug 20, 2014
2 parents b2dc3bf + eb53ca6 commit f157fe7
Showing 20 changed files with 648 additions and 411 deletions.
2 changes: 1 addition & 1 deletion core/src/main/scala/org/apache/spark/SparkConf.scala
@@ -227,7 +227,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging {
// Validate spark.executor.extraJavaOptions
settings.get(executorOptsKey).map { javaOpts =>
if (javaOpts.contains("-Dspark")) {
val msg = s"$executorOptsKey is not allowed to set Spark options (was '$javaOpts)'. " +
val msg = s"$executorOptsKey is not allowed to set Spark options (was '$javaOpts'). " +
"Set them directly on a SparkConf or in a properties file when using ./bin/spark-submit."
throw new Exception(msg)
}
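For context, a minimal sketch (not part of the diff) of the configuration style this check steers users toward — Spark options go on the `SparkConf` itself, or in a properties file passed to `./bin/spark-submit`, never through the executor's JVM options:

```scala
import org.apache.spark.SparkConf

// A hedged sketch: set Spark options directly on the SparkConf rather than
// smuggling them through spark.executor.extraJavaOptions, which the
// validation above rejects whenever it sees a "-Dspark" flag.
val conf = new SparkConf()
  .setAppName("Example") // hypothetical app name
  .set("spark.executor.memory", "2g") // fine: a plain Spark option
// Rejected by the check above:
// .set("spark.executor.extraJavaOptions", "-Dspark.executor.memory=2g")
```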
core/src/main/scala/org/apache/spark/network/ConnectionManager.scala
@@ -467,7 +467,7 @@ private[spark] class ConnectionManager(

val sendingConnectionOpt = connectionsById.get(remoteConnectionManagerId)
if (!sendingConnectionOpt.isDefined) {
logError("Corresponding SendingConnectionManagerId not found")
logError(s"Corresponding SendingConnection to ${remoteConnectionManagerId} not found")
return
}

14 changes: 8 additions & 6 deletions docs/ec2-scripts.md
@@ -12,14 +12,16 @@ on the [Amazon Web Services site](http://aws.amazon.com/).

`spark-ec2` is designed to manage multiple named clusters. You can
launch a new cluster (telling the script its size and giving it a name),
-shut down an existing cluster, or log into a cluster. Each cluster is
-identified by placing its machines into EC2 security groups whose names
-are derived from the name of the cluster. For example, a cluster named
+shut down an existing cluster, or log into a cluster. Each cluster
+launches a set of instances, which are tagged with the cluster name,
+and placed into EC2 security groups. If you don't specify a security
+group, the `spark-ec2` script will create security groups based on the
+cluster name you request. For example, a cluster named
 `test` will contain a master node in a security group called
 `test-master`, and a number of slave nodes in a security group called
-`test-slaves`. The `spark-ec2` script will create these security groups
-for you based on the cluster name you request. You can also use them to
-identify machines belonging to each cluster in the Amazon EC2 Console.
+`test-slaves`. You can also specify a security group prefix to be used
+in place of the cluster name. Machines in a cluster can be identified
+by looking for the "Name" tag of the instance in the Amazon EC2 Console.
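As an illustration (not part of the docs change), a minimal `boto` sketch of matching a cluster's machines by that "Name" tag, mirroring what the updated `spark_ec2.py` below does; the region and cluster name are placeholders:

```python
import boto.ec2

# Hypothetical example: list the instances of a cluster named "test"
# by the "Name" tag that spark-ec2 assigns at launch time.
conn = boto.ec2.connect_to_region("us-east-1")  # assumed region
for reservation in conn.get_all_instances():
    for inst in reservation.instances:
        name = inst.tags.get(u'Name', "")
        if name.startswith("test-master") or name.startswith("test-slave"):
            print inst.id, name, inst.state
```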


# Before You Start
2 changes: 1 addition & 1 deletion docs/mllib-guide.md
@@ -9,7 +9,7 @@ filtering, dimensionality reduction, as well as underlying optimization primitives

* [Data types](mllib-basics.html)
* [Basic statistics](mllib-stats.html)
-* data generators
+* random data generation
* stratified sampling
* summary statistics
* hypothesis testing
74 changes: 73 additions & 1 deletion docs/mllib-stats.md
@@ -25,7 +25,79 @@ displayTitle: <a href="mllib-guide.html">MLlib</a> - Statistics Functionality
\newcommand{\zero}{\mathbf{0}}
\]`

-## Data Generators
+## Random data generation

Random data generation is useful for randomized algorithms, prototyping, and performance testing.
MLlib supports generating random RDDs with i.i.d. values drawn from a given distribution:
uniform, standard normal, or Poisson.

<div class="codetabs">
<div data-lang="scala" markdown="1">
[`RandomRDDs`](api/scala/index.html#org.apache.spark.mllib.random.RandomRDDs) provides factory
methods to generate random double RDDs or vector RDDs.
The following example generates a random double RDD whose values follow the standard
normal distribution `N(0, 1)`, and then maps it to `N(1, 4)`.

{% highlight scala %}
import org.apache.spark.SparkContext
import org.apache.spark.mllib.random.RandomRDDs._

val sc: SparkContext = ...

// Generate a random double RDD that contains 1 million i.i.d. values drawn from the
// standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
val u = normalRDD(sc, 1000000L, 10)
// Apply a transform to get a random double RDD following `N(1, 4)`.
val v = u.map(x => 1.0 + 2.0 * x)
{% endhighlight %}
</div>

<div data-lang="java" markdown="1">
[`RandomRDDs`](api/java/index.html#org.apache.spark.mllib.random.RandomRDDs) provides factory
methods to generate random double RDDs or vector RDDs.
The following example generates a random double RDD whose values follow the standard
normal distribution `N(0, 1)`, and then maps it to `N(1, 4)`.

{% highlight java %}
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import static org.apache.spark.mllib.random.RandomRDDs.*;

JavaSparkContext jsc = ...

// Generate a random double RDD that contains 1 million i.i.d. values drawn from the
// standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
JavaDoubleRDD u = normalJavaRDD(jsc, 1000000L, 10);
// Apply a transform to get a random double RDD following `N(1, 4)`.
JavaRDD<Double> v = u.map(
new Function<Double, Double>() {
public Double call(Double x) {
return 1.0 + 2.0 * x;
}
});
{% endhighlight %}
</div>

<div data-lang="python" markdown="1">
[`RandomRDDs`](api/python/pyspark.mllib.random.RandomRDDs-class.html) provides factory
methods to generate random double RDDs or vector RDDs.
The following example generates a random double RDD whose values follow the standard
normal distribution `N(0, 1)`, and then maps it to `N(1, 4)`.

{% highlight python %}
from pyspark.mllib.random import RandomRDDs

sc = ... # SparkContext

# Generate a random double RDD that contains 1 million i.i.d. values drawn from the
# standard normal distribution `N(0, 1)`, evenly distributed in 10 partitions.
u = RandomRDDs.normalRDD(sc, 1000000L, 10)
# Apply a transform to get a random double RDD following `N(1, 4)`.
v = u.map(lambda x: 1.0 + 2.0 * x)
{% endhighlight %}
</div>

</div>
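For reproducible runs, the `RandomRDDs` factory methods also accept an explicit seed; a hedged sketch (the seed parameter is assumed from the API linked above):

```scala
import org.apache.spark.mllib.random.RandomRDDs._

// Assumed: passing a fixed seed makes the generated RDD reproducible.
val w = normalRDD(sc, 1000000L, 10, seed = 42L)
```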

## Stratified Sampling

71 changes: 49 additions & 22 deletions ec2/spark_ec2.py
@@ -124,7 +124,7 @@ def parse_args():
help="The SSH user you want to connect as (default: root)")
parser.add_option(
"--delete-groups", action="store_true", default=False,
help="When destroying a cluster, delete the security groups that were created")
help="When destroying a cluster, delete the security groups that were created.")
parser.add_option(
"--use-existing-master", action="store_true", default=False,
help="Launch fresh slaves, but use an existing stopped master if possible")
@@ -138,7 +138,9 @@ def parse_args():
parser.add_option(
"--user-data", type="string", default="",
help="Path to a user-data file (most AMI's interpret this as an initialization script)")

+    parser.add_option(
+        "--security-group-prefix", type="string", default=None,
+        help="Use this prefix for the security group rather than the cluster name.")

(opts, args) = parser.parse_args()
if len(args) != 2:
@@ -285,8 +287,12 @@ def launch_cluster(conn, opts, cluster_name):
user_data_content = user_data_file.read()

print "Setting up security groups..."
-    master_group = get_or_make_group(conn, cluster_name + "-master")
-    slave_group = get_or_make_group(conn, cluster_name + "-slaves")
+    if opts.security_group_prefix is None:
+        master_group = get_or_make_group(conn, cluster_name + "-master")
+        slave_group = get_or_make_group(conn, cluster_name + "-slaves")
+    else:
+        master_group = get_or_make_group(conn, opts.security_group_prefix + "-master")
+        slave_group = get_or_make_group(conn, opts.security_group_prefix + "-slaves")
if master_group.rules == []: # Group was just now created
master_group.authorize(src_group=master_group)
master_group.authorize(src_group=slave_group)
@@ -310,12 +316,11 @@ def launch_cluster(conn, opts, cluster_name):
slave_group.authorize('tcp', 60060, 60060, '0.0.0.0/0')
slave_group.authorize('tcp', 60075, 60075, '0.0.0.0/0')

-    # Check if instances are already running in our groups
+    # Check if instances are already running with the cluster name
     existing_masters, existing_slaves = get_existing_cluster(conn, opts, cluster_name,
                                                              die_on_error=False)
     if existing_slaves or (existing_masters and not opts.use_existing_master):
-        print >> stderr, ("ERROR: There are already instances running in " +
-                          "group %s or %s" % (master_group.name, slave_group.name))
+        print >> stderr, "ERROR: There are already instances running for cluster name: %s" % cluster_name
         sys.exit(1)

# Figure out Spark AMI
@@ -371,9 +376,13 @@ def launch_cluster(conn, opts, cluster_name):
for r in reqs:
id_to_req[r.id] = r
active_instance_ids = []
+            outstanding_request_ids = []
             for i in my_req_ids:
-                if i in id_to_req and id_to_req[i].state == "active":
-                    active_instance_ids.append(id_to_req[i].instance_id)
+                if i in id_to_req:
+                    if id_to_req[i].state == "active":
+                        active_instance_ids.append(id_to_req[i].instance_id)
+                    else:
+                        outstanding_request_ids.append(i)
if len(active_instance_ids) == opts.slaves:
print "All %d slaves granted" % opts.slaves
reservations = conn.get_all_instances(active_instance_ids)
@@ -382,8 +391,8 @@ def launch_cluster(conn, opts, cluster_name):
slave_nodes += r.instances
break
else:
print "%d of %d slaves granted, waiting longer" % (
len(active_instance_ids), opts.slaves)
print "%d of %d slaves granted, waiting longer for request ids including %s" % (
len(active_instance_ids), opts.slaves, outstanding_request_ids[0:10])
except:
print "Canceling spot instance requests"
conn.cancel_spot_instance_requests(my_req_ids)
@@ -440,14 +449,29 @@ def launch_cluster(conn, opts, cluster_name):
print "Launched master in %s, regid = %s" % (zone, master_res.id)

# Give the instances descriptive names
+    # TODO: Add retry logic for tagging with name since it's used to identify a cluster.
     for master in master_nodes:
-        master.add_tag(
-            key='Name',
-            value='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id))
+        name = '{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id)
+        for i in range(0, 5):
+            try:
+                master.add_tag(key='Name', value=name)
+                break
+            except:
+                print "Failed attempt %i of 5 to tag %s" % ((i + 1), name)
+                if i == 4:
+                    raise Exception("Error - failed max attempts to add name tag")
+                time.sleep(5)


     for slave in slave_nodes:
-        slave.add_tag(
-            key='Name',
-            value='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id))
+        name = '{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id)
+        for i in range(0, 5):
+            try:
+                slave.add_tag(key='Name', value=name)
+                break
+            except:
+                print "Failed attempt %i of 5 to tag %s" % ((i + 1), name)
+                if i == 4:
+                    raise Exception("Error - failed max attempts to add name tag")
+                time.sleep(5)

# Return all the instances
return (master_nodes, slave_nodes)
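The two retry loops above are identical apart from the tag value; a hedged sketch of a shared helper (hypothetical name `add_name_tag_with_retry`, not in the commit) that would remove the duplication:

```python
import time

def add_name_tag_with_retry(instance, name, attempts=5):
    """Hypothetical helper: tag an EC2 instance, retrying transient failures."""
    for i in range(attempts):
        try:
            instance.add_tag(key='Name', value=name)
            return
        except:
            print "Failed attempt %i of %i to tag %s" % (i + 1, attempts, name)
            time.sleep(5)
    raise Exception("Error - failed max attempts to add name tag")

# Usage sketch:
# for master in master_nodes:
#     add_name_tag_with_retry(
#         master, '{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id))
```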
@@ -463,18 +487,18 @@ def get_existing_cluster(conn, opts, cluster_name, die_on_error=True):
for res in reservations:
active = [i for i in res.instances if is_active(i)]
for inst in active:
-            group_names = [g.name for g in inst.groups]
-            if group_names == [cluster_name + "-master"]:
+            name = inst.tags.get(u'Name', "")
+            if name.startswith(cluster_name + "-master"):
                 master_nodes.append(inst)
-            elif group_names == [cluster_name + "-slaves"]:
+            elif name.startswith(cluster_name + "-slave"):
                 slave_nodes.append(inst)
if any((master_nodes, slave_nodes)):
print ("Found %d master(s), %d slaves" % (len(master_nodes), len(slave_nodes)))
if master_nodes != [] or not die_on_error:
return (master_nodes, slave_nodes)
else:
if master_nodes == [] and slave_nodes != []:
print >> sys.stderr, "ERROR: Could not find master in group " + cluster_name + "-master"
print >> sys.stderr, "ERROR: Could not find master in with name " + cluster_name + "-master"
else:
print >> sys.stderr, "ERROR: Could not find any existing cluster"
sys.exit(1)
@@ -816,7 +840,10 @@ def real_main():
# Delete security groups as well
if opts.delete_groups:
print "Deleting security groups (this will take some time)..."
-        group_names = [cluster_name + "-master", cluster_name + "-slaves"]
+        if opts.security_group_prefix is None:
+            group_names = [cluster_name + "-master", cluster_name + "-slaves"]
+        else:
+            group_names = [opts.security_group_prefix + "-master",
+                           opts.security_group_prefix + "-slaves"]

attempt = 1
while attempt <= 3:
7 changes: 7 additions & 0 deletions mllib/pom.xml
@@ -91,6 +91,13 @@
<artifactId>junit-interface</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
<scope>test</scope>
</dependency>
</dependencies>
<profiles>
<profile>
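The test-jar dependency above makes spark-streaming's test utilities visible to mllib's tests; a hedged sketch of what that enables, assuming `org.apache.spark.streaming.TestSuiteBase` is the harness the test-jar ships:

```scala
import org.apache.spark.streaming.TestSuiteBase

// Hypothetical suite: with the test-jar on mllib's test classpath, a
// streaming-based MLlib test can extend the streaming test harness.
class StreamingMLExampleSuite extends TestSuiteBase {
  test("streaming test harness is available") {
    assert(batchDuration.milliseconds > 0)
  }
}
```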
mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala
@@ -20,14 +20,14 @@ package org.apache.spark.mllib.random
import cern.jet.random.Poisson
import cern.jet.random.engine.DRand

-import org.apache.spark.annotation.Experimental
+import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.random.{XORShiftRandom, Pseudorandom}

/**
- * :: Experimental ::
+ * :: DeveloperApi ::
* Trait for random data generators that generate i.i.d. data.
*/
-@Experimental
+@DeveloperApi
trait RandomDataGenerator[T] extends Pseudorandom with Serializable {

/**
@@ -43,10 +43,10 @@ trait RandomDataGenerator[T] extends Pseudorandom with Serializable {
}

/**
- * :: Experimental ::
+ * :: DeveloperApi ::
* Generates i.i.d. samples from U[0.0, 1.0]
*/
-@Experimental
+@DeveloperApi
class UniformGenerator extends RandomDataGenerator[Double] {

// XORShiftRandom for better performance. Thread safety isn't necessary here.
@@ -62,10 +62,10 @@ class UniformGenerator extends RandomDataGenerator[Double] {
}

/**
- * :: Experimental ::
+ * :: DeveloperApi ::
* Generates i.i.d. samples from the standard normal distribution.
*/
-@Experimental
+@DeveloperApi
class StandardNormalGenerator extends RandomDataGenerator[Double] {

// XORShiftRandom for better performance. Thread safety isn't necessary here.
@@ -81,12 +81,12 @@ class StandardNormalGenerator extends RandomDataGenerator[Double] {
}

/**
- * :: Experimental ::
+ * :: DeveloperApi ::
* Generates i.i.d. samples from the Poisson distribution with the given mean.
*
* @param mean mean for the Poisson distribution.
*/
-@Experimental
+@DeveloperApi
class PoissonGenerator(val mean: Double) extends RandomDataGenerator[Double] {

private var rng = new Poisson(mean, new DRand)
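With the annotation change, these generators are exposed as a developer API; a hedged sketch (not in the commit) of a custom generator built against the trait, assuming its `nextValue()`/`copy()` contract and `setSeed` from `Pseudorandom`:

```scala
import java.util.Random

// Hypothetical example: i.i.d. samples from an exponential distribution
// with the given mean, via inverse-CDF sampling.
class ExponentialGenerator(val mean: Double) extends RandomDataGenerator[Double] {

  private val random = new Random()

  // X = -mean * ln(1 - U), with U ~ U[0, 1), is exponential with the given mean.
  override def nextValue(): Double = -mean * math.log1p(-random.nextDouble())

  override def setSeed(seed: Long): Unit = random.setSeed(seed)

  override def copy(): ExponentialGenerator = new ExponentialGenerator(mean)
}
```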
