REL-505 merge Apache branch-1.1 bug fixes and add new ByteswapPartitioner #27

Closed
wants to merge 78 commits into from
Changes from all commits
Commits (78)
b528367
Revert "[SPARK-4075] [Deploy] Jar url validation is not enough for Ja…
Nov 17, 2014
cf8d0ef
Revert "[maven-release-plugin] prepare for next development iteration"
Nov 17, 2014
e4f5695
Revert "[maven-release-plugin] prepare release v1.1.1-rc1"
Nov 17, 2014
aa9ebda
[SPARK-4467] Partial fix for fetch failure in sort-based shuffle (1.1)
Nov 18, 2014
91b5fa8
[SPARK-4393] Fix memory leak in ConnectionManager ACK timeout TimerTa…
sarutak Nov 18, 2014
ae9b1f6
[SPARK-4433] fix a racing condition in zipWithIndex
mengxr Nov 19, 2014
f9739b9
[SPARK-4468][SQL] Backports #3334 to branch-1.1
liancheng Nov 19, 2014
e22a759
[SPARK-4380] Log more precise number of bytes spilled (1.1)
Nov 19, 2014
1713c7e
Merge branch 'branch-1.1' of github.com:apache/spark into csd-1.1
markhamstra Nov 19, 2014
16bf5f3
[SPARK-4480] Avoid many small spills in external data structures (1.1)
Nov 19, 2014
aa3c794
Update CHANGES.txt for 1.1.1-rc2
andrewor14 Nov 19, 2014
3693ae5
[maven-release-plugin] prepare release v1.1.1-rc2
andrewor14 Nov 19, 2014
1df1c1d
[maven-release-plugin] prepare for next development iteration
andrewor14 Nov 19, 2014
b838cef
Merge tag 'v1.1.1-rc2' of github.com:apache/spark into csd-1.1
markhamstra Nov 24, 2014
1b2b7dd
Fixed merge typo
markhamstra Nov 24, 2014
6371737
Update versions to 1.1.2-SNAPSHOT
Nov 24, 2014
7aa592c
[SPARK-4196][SPARK-4602][Streaming] Fix serialization issue in PairDS…
tdas Nov 25, 2014
1a7f414
[HOTFIX] Fixing broken build due to missing imports.
tdas Nov 25, 2014
a59c445
[Release] Automate generation of contributors list
Nov 27, 2014
f8a4fd3
[BRANCH-1.1][SPARK-4626] Kill a task only if the executorId is (still…
roxchkplusony Nov 28, 2014
24b5c03
[SPARK-4597] Use proper exception and reset variable in Utils.createT…
viirya Nov 29, 2014
1a2508b
SPARK-2143 [WEB UI] Add Spark version to UI footer
srowen Nov 30, 2014
90d90b2
[HOTFIX] Fix build break in 1a2508b73f6d46a0faf7740b85a5c216c925c25a
JoshRosen Nov 30, 2014
91eadd2
[DOC] Fixes formatting typo in SQL programming guide
liancheng Dec 1, 2014
f333e4f
[SPARK-4686] Link to allowed master URLs is broken
kayousterhout Dec 2, 2014
aec20af
[Release] Translate unknown author names automatically
Dec 3, 2014
e484b8a
[SPARK-4701] Typo in sbt/sbt
tsudukim Dec 3, 2014
af76954
[SPARK-4715][Core] Make sure tryToAcquire won't return a negative value
zsxwing Dec 3, 2014
3e3cd5a
[SPARK-4642] Add description about spark.yarn.queue to running-on-YAR…
tsudukim Dec 3, 2014
17dfd41
[SPARK-4498][core] Don't transition ExecutorInfo to RUNNING until Dri…
markhamstra Dec 3, 2014
6c53225
[Release] Correctly translate contributors name in release notes
Dec 4, 2014
5ac55c8
[SPARK-4253] Ignore spark.driver.host in yarn-cluster and standalone-…
WangTaoTheTonic Dec 4, 2014
d01fdd3
[SPARK-4745] Fix get_existing_cluster() function with multiple securi…
alexdebrie Dec 4, 2014
e98aa54
[SPARK-4459] Change groupBy type parameter from K to U
Dec 4, 2014
bf637e0
[SPARK-4652][DOCS] Add docs about spark-git-repo option
Lewuathe Dec 4, 2014
b09382a
[SPARK-4421] Wrong link in spark-standalone.html
tsudukim Dec 5, 2014
8ee2d18
Fix typo in Spark SQL docs.
andyk Dec 5, 2014
a290486
Merge branch 'branch-1.1' of github.com:apache/spark into csd-1.1
markhamstra Dec 5, 2014
16bc77b
[SPARK-4764] Ensure that files are fetched atomically
Dec 8, 2014
fe7d7a9
SPARK-3926 [CORE] Reopened: result of JavaRDD collectAsMap() is not s…
srowen Dec 9, 2014
7bf3aa3
Merge branch 'branch-1.1' of github.com:apache/spark into csd-1.1
markhamstra Dec 9, 2014
9b99237
[SPARK-4714] BlockManager.dropFromMemory() should check whether block…
suyanNone Dec 9, 2014
6dcafa7
[SPARK-4772] Clear local copies of accumulators as soon as we're done…
Dec 10, 2014
273f2c8
[SPARK-4771][Docs] Document standalone cluster supervise mode
Dec 10, 2014
396de67
[SPARK-4759] Fix driver hanging from coalescing partitions
Dec 10, 2014
0faea17
fixed spelling errors in documentation
peterklipfel Dec 14, 2014
fa3b3e3
SPARK-785 [CORE] ClosureCleaner not invoked on most PairRDDFunctions
srowen Dec 16, 2014
892685b
SPARK-4814 [CORE] Enable assertions in SBT, Maven tests / AssertionEr…
srowen Dec 16, 2014
581f866
[Release] Major improvements to generate contributors script
Dec 17, 2014
991748d
[Release] Cache known author translations locally
Dec 17, 2014
0efd691
[Release] Update contributors list format and sort it
Dec 17, 2014
c15e7f2
[HOTFIX] Fix RAT exclusion for known_translations file
JoshRosen Dec 17, 2014
bed4807
Merge branch 'branch-1.1' of github.com:apache/spark into csd-1.1
markhamstra Dec 18, 2014
f4e6ffc
[SPARK-4884]: Improve Partition docs
msiddalingaiah Dec 19, 2014
2d66463
SPARK-3428. TaskMetrics for running tasks is missing GC time metrics
sryza Dec 19, 2014
546a239
[SPARK-4896] don’t redundantly overwrite executor JAR deps
ryan-williams Dec 19, 2014
3597c2e
SPARK-2641: Passing num executors to spark arguments from properties …
Dec 20, 2014
e5f2752
[Minor] Build Failed: value defaultProperties not found
SaintBacchus Dec 20, 2014
3bce43f
[SPARK-4818][Core] Add 'iterator' to reduce memory consumed by join
zsxwing Dec 22, 2014
b1de461
[SPARK-4802] [streaming] Remove receiverInfo once receiver is de-regi…
ilayaperumalg Dec 23, 2014
dd0287c
[SPARK-4606] Send EOF to child JVM when there's no more data to read.
Dec 24, 2014
d21347d
[SPARK-4537][Streaming] Expand StreamingSource to add more metrics
jerryshao Dec 26, 2014
3442b7b
[SPARK-4952][Core]Handle ConcurrentModificationExceptions in SparkEnv…
witgo Dec 27, 2014
d5e0a45
[HOTFIX] Add SPARK_VERSION to Spark package object.
JoshRosen Dec 30, 2014
822a0b4
[SPARK-4882] Register PythonBroadcast with Kryo so that PySpark works…
JoshRosen Dec 30, 2014
d6b8d2c
Revert "[SPARK-4882] Register PythonBroadcast with Kryo so that PySpa…
JoshRosen Dec 30, 2014
eac740e
[SPARK-4813][Streaming] Fix the issue that ContextWaiter didn't handl…
zsxwing Dec 30, 2014
babcafa
[SPARK-1010] Clean up uses of System.setProperty in unit tests
JoshRosen Dec 31, 2014
08d4f70
[SPARK-4298][Core] - The spark-submit cannot read Main-Class from Man…
Dec 31, 2014
1034707
[HOTFIX] Disable Spark UI in SparkSubmitSuite tests
JoshRosen Dec 12, 2014
61eb9be
[SPARK-5035] [Streaming] ReceiverMessage trait should extend Serializ…
JoshRosen Jan 1, 2015
c532acf
[HOTFIX] Bind web UI to ephemeral port in DriverSuite
JoshRosen Jan 1, 2015
6d1ca23
[SPARK-4787] Stop SparkContext if a DAGScheduler init error occurs
tigerquoll Jan 4, 2015
55325af
[SPARK-5132][Core]Correct stage Attempt Id key in stageInfofromJson
suyanNone Jan 7, 2015
c07a691
Merge branch 'branch-1.1' of github.com:apache/spark into csd-1.1
markhamstra Jan 9, 2015
677281e
Added ByteswapPartitioner and made it the defaultPartitioner
markhamstra Jan 2, 2015
470f026
Added spark.default.partitioner to conf
markhamstra Jan 5, 2015
99910b1
spark.default.partitioner docs
markhamstra Jan 6, 2015
5 changes: 4 additions & 1 deletion .gitignore
@@ -3,6 +3,7 @@
*.ipr
*.iml
*.iws
*.pyc
.idea/
sbt/*.jar
.settings
@@ -45,7 +46,9 @@ dependency-reduced-pom.xml
checkpoint
derby.log
dist/
spark-*-bin.tar.gz
dev/create-release/*txt
dev/create-release/*final
spark-*-bin-*.tgz
unit-tests.log
/lib/
rat-results.txt
2 changes: 2 additions & 0 deletions .rat-excludes
@@ -57,3 +57,5 @@ dist/*
.*iws
logs
.*scalastyle-output.xml
.*dependency-reduced-pom.xml
known_translations
65 changes: 65 additions & 0 deletions CHANGES.txt
@@ -3,6 +3,71 @@ Spark Change Log

Release 1.1.1

[SPARK-4480] Avoid many small spills in external data structures (1.1)
Andrew Or <andrew@databricks.com>
2014-11-19 10:45:42 -0800
Commit: 16bf5f3, github.com/apache/spark/pull/3354

[SPARK-4380] Log more precise number of bytes spilled (1.1)
Andrew Or <andrew@databricks.com>
2014-11-18 20:15:00 -0800
Commit: e22a759, github.com/apache/spark/pull/3355

[SPARK-4468][SQL] Backports #3334 to branch-1.1
Cheng Lian <lian@databricks.com>
2014-11-18 17:40:24 -0800
Commit: f9739b9, github.com/apache/spark/pull/3338

[SPARK-4433] fix a racing condition in zipWithIndex
Xiangrui Meng <meng@databricks.com>
2014-11-18 16:25:44 -0800
Commit: ae9b1f6, github.com/apache/spark/pull/3291

[SPARK-4393] Fix memory leak in ConnectionManager ACK timeout TimerTasks; use HashedWheelTimer (For branch-1.1)
Kousuke Saruta <sarutak@oss.nttdata.co.jp>
2014-11-18 12:09:18 -0800
Commit: 91b5fa8, github.com/apache/spark/pull/3321

[SPARK-4467] Partial fix for fetch failure in sort-based shuffle (1.1)
Andrew Or <andrew@databricks.com>
2014-11-17 18:10:49 -0800
Commit: aa9ebda, github.com/apache/spark/pull/3330

Revert "[SPARK-4075] [Deploy] Jar url validation is not enough for Jar file"
Andrew Or <andrew@databricks.com>
2014-11-17 11:25:38 -0800
Commit: b528367

[branch-1.1][SPARK-4355] OnlineSummarizer doesn't merge mean correctly
Xiangrui Meng <meng@databricks.com>
2014-11-13 15:36:03 -0800
Commit: 4b1c77c, github.com/apache/spark/pull/3251

[Release] Correct make-distribution.sh log path
Andrew Or <andrew@databricks.com>
2014-11-12 13:46:26 -0800
Commit: ba6d81d

[Release] Bring audit scripts up-to-date
Andrew Or <andrewor14@gmail.com>
2014-11-13 00:30:58 +0000
Commit: 88bc482

[Release] Log build output for each distribution
Andrew Or <andrew@databricks.com>
2014-11-11 18:02:59 -0800
Commit: e3a5ee9

Revert "SPARK-3039: Allow spark to be built using avro-mapred for hadoop2"
Andrew Or <andrew@databricks.com>
2014-11-12 00:04:30 -0800
Commit: 45a01b6

Update CHANGES.txt
Andrew Or <andrewor14@gmail.com>
2014-11-11 23:11:32 +0000
Commit: 131c626

[SPARK-4295][External]Fix exception in SparkSinkSuite
maji2014 <maji3@asiainfo.com>
2014-11-11 02:18:27 -0800
3 changes: 2 additions & 1 deletion LICENSE
@@ -363,7 +363,8 @@ THE SOFTWARE.

========================================================================
For Scala Interpreter classes (all .scala files in repl/src/main/scala
except for Main.Scala, SparkHelper.scala and ExecutorClassLoader.scala):
except for Main.Scala, SparkHelper.scala and ExecutorClassLoader.scala),
and for SerializableMapWrapper in JavaUtils.scala:
========================================================================

Copyright (c) 2002-2013 EPFL
2 changes: 1 addition & 1 deletion assembly/pom.xml
@@ -21,7 +21,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent</artifactId>
<version>1.1.1-candidate-csd-4-SNAPSHOT</version>
<version>1.1.2-candidate-csd-1-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

2 changes: 1 addition & 1 deletion bagel/pom.xml
@@ -21,7 +21,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent</artifactId>
<version>1.1.1-candidate-csd-4-SNAPSHOT</version>
<version>1.1.2-candidate-csd-1-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

2 changes: 1 addition & 1 deletion core/pom.xml
@@ -21,7 +21,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent</artifactId>
<version>1.1.1-candidate-csd-4-SNAPSHOT</version>
<version>1.1.2-candidate-csd-1-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

14 changes: 8 additions & 6 deletions core/src/main/scala/org/apache/spark/Accumulators.scala
@@ -18,6 +18,7 @@
package org.apache.spark

import java.io.{ObjectInputStream, Serializable}
import java.lang.ThreadLocal

import scala.collection.generic.Growable
import scala.collection.mutable.Map
@@ -246,10 +247,12 @@ trait AccumulatorParam[T] extends AccumulableParam[T, T] {

// TODO: The multi-thread support in accumulators is kind of lame; check
// if there's a more intuitive way of doing it right
private object Accumulators {
private[spark] object Accumulators {
// TODO: Use soft references? => need to make readObject work properly then
val originals = Map[Long, Accumulable[_, _]]()
val localAccums = Map[Thread, Map[Long, Accumulable[_, _]]]()
val localAccums = new ThreadLocal[Map[Long, Accumulable[_, _]]]() {
override protected def initialValue() = Map[Long, Accumulable[_, _]]()
}
var lastId: Long = 0

def newId: Long = synchronized {
@@ -261,22 +264,21 @@ private object Accumulators {
if (original) {
originals(a.id) = a
} else {
val accums = localAccums.getOrElseUpdate(Thread.currentThread, Map())
accums(a.id) = a
localAccums.get()(a.id) = a
}
}

// Clear the local (non-original) accumulators for the current thread
def clear() {
synchronized {
localAccums.remove(Thread.currentThread)
localAccums.get.clear
}
}

// Get the values of the local accumulators for the current thread (by ID)
def values: Map[Long, Any] = synchronized {
val ret = Map[Long, Any]()
for ((id, accum) <- localAccums.getOrElse(Thread.currentThread, Map())) {
for ((id, accum) <- localAccums.get) {
ret(id) = accum.localValue
}
return ret
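For readers skimming the Accumulators change: the leak fix swaps a Map keyed by Thread for a ThreadLocal, so each thread sees only its own registrations and clear() cannot leave references to finished threads behind. Below is a minimal standalone sketch of that pattern; the names are illustrative, not Spark APIs.

import scala.collection.mutable

object PerThreadRegistry {
  // Each thread lazily gets its own map; no global Map[Thread, ...] keeps
  // finished threads (and everything they reference) alive.
  private val locals = new ThreadLocal[mutable.Map[Long, String]]() {
    override protected def initialValue() = mutable.Map.empty[Long, String]
  }

  def register(id: Long, value: String): Unit = locals.get()(id) = value

  // Clears only the calling thread's entries, mirroring Accumulators.clear().
  def clear(): Unit = locals.get().clear()

  def values: Map[Long, String] = locals.get().toMap
}

object PerThreadRegistryDemo {
  def main(args: Array[String]): Unit = {
    PerThreadRegistry.register(1L, "a")
    println(PerThreadRegistry.values)   // Map(1 -> a)
    PerThreadRegistry.clear()
    println(PerThreadRegistry.values)   // Map()
  }
}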
4 changes: 2 additions & 2 deletions core/src/main/scala/org/apache/spark/Partition.scala
@@ -18,11 +18,11 @@
package org.apache.spark

/**
* A partition of an RDD.
* An identifier for a partition in an RDD.
*/
trait Partition extends Serializable {
/**
* Get the split's index within its parent RDD
* Get the partition's index within its parent RDD
*/
def index: Int

25 changes: 23 additions & 2 deletions core/src/main/scala/org/apache/spark/Partitioner.scala
@@ -55,14 +55,23 @@ object Partitioner {
* We use two method parameters (rdd, others) to enforce callers passing at least 1 RDD.
*/
def defaultPartitioner(rdd: RDD[_], others: RDD[_]*): Partitioner = {
val shortPartitionerNames = Map(
"hash" -> "org.apache.spark.HashPartitioner",
"byteswap" -> "org.apache.spark.ByteswapPartitioner"
)
val defaultPartitionerName = rdd.conf.get("spark.default.partitioner", "hash")
val className =
shortPartitionerNames.getOrElse(defaultPartitionerName.toLowerCase, defaultPartitionerName)
val ctor = Class.forName(className, true, Utils.getContextOrSparkClassLoader)
.getConstructor(classOf[Int])
val bySize = (Seq(rdd) ++ others).sortBy(_.partitions.size).reverse
for (r <- bySize if r.partitioner.isDefined) {
return r.partitioner.get
}
if (rdd.context.conf.contains("spark.default.parallelism")) {
new HashPartitioner(rdd.context.defaultParallelism)
ctor.newInstance(rdd.context.defaultParallelism: java.lang.Integer).asInstanceOf[Partitioner]
} else {
new HashPartitioner(bySize.head.partitions.size)
ctor.newInstance(bySize.head.partitions.size: java.lang.Integer).asInstanceOf[Partitioner]
}
}
}
@@ -93,6 +102,18 @@ class HashPartitioner(partitions: Int) extends Partitioner {
override def hashCode: Int = numPartitions
}

/**
* A [[org.apache.spark.Partitioner]] that implements hash-based partitioning using
* Java's `Object.hashCode`. In order to spread-out hashCodes that are divisible by
* `numPartitions`, `byteswap32` is applied to the hashCodes before modding by `numPartitions`.
*/
class ByteswapPartitioner(partitions: Int) extends HashPartitioner(partitions) {
override def getPartition(key: Any): Int = key match {
case null => 0
case _ => Utils.nonNegativeMod(byteswap32(key.hashCode), numPartitions)
}
}

/**
* A [[org.apache.spark.Partitioner]] that partitions sortable records by range into roughly
* equal ranges. The ranges are determined by sampling the content of the RDD passed in.
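To make the intent of the new configuration concrete, here is a hedged usage sketch (not part of the patch). It assumes only what the diff above shows: spark.default.partitioner accepts the short names hash and byteswap, and defaultPartitioner instantiates the chosen class. The sample data and app name are made up.

import org.apache.spark.{SparkConf, SparkContext}

object ByteswapPartitionerDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("byteswap-demo")
      .set("spark.default.partitioner", "byteswap") // resolved to org.apache.spark.ByteswapPartitioner
    val sc = new SparkContext(conf)

    // Keys that are all multiples of the partition count collapse into one
    // partition under plain hashCode % numPartitions; byteswap32 spreads them out.
    val pairs = sc.parallelize(Seq(0, 8, 16, 24, 32).map(k => (k, k)), 4)
    val partitionSizes = pairs.groupByKey()    // picks up the configured default partitioner
      .glom()
      .map(_.length)
      .collect()
    println(partitionSizes.mkString(", "))

    sc.stop()
  }
}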
11 changes: 8 additions & 3 deletions core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -324,8 +324,13 @@ class SparkContext(config: SparkConf) extends Logging {
try {
dagScheduler = new DAGScheduler(this)
} catch {
case e: Exception => throw
new SparkException("DAGScheduler cannot be initialized due to %s".format(e.getMessage))
case e: Exception => {
try {
stop()
} finally {
throw new SparkException("Error while constructing DAGScheduler", e)
}
}
}

// start TaskScheduler after taskScheduler sets DAGScheduler reference in DAGScheduler's
@@ -1334,7 +1339,7 @@ class SparkContext(config: SparkConf) extends Logging {
*/
object SparkContext extends Logging {

private[spark] val SPARK_VERSION = "1.1.1"
private[spark] val SPARK_VERSION = "1.1.2-SNAPSHOT"

private[spark] val SPARK_JOB_DESCRIPTION = "spark.job.description"

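The SparkContext hunk above follows a small but easy-to-get-wrong pattern: if constructing a component fails, release whatever was already set up, then rethrow with the original exception attached as the cause. A self-contained sketch of that pattern, with all names invented for illustration:

object CleanupOnInitError {
  final class InitFailure(msg: String, cause: Throwable) extends RuntimeException(msg, cause)

  private def releasePartialState(): Unit = println("released partially-initialized state")

  def init(fail: Boolean): Unit =
    try {
      if (fail) throw new IllegalStateException("scheduler construction failed")
      println("initialized")
    } catch {
      case e: Exception =>
        // Clean up first, but let the wrapped exception (carrying the cause) win,
        // as the nested try/finally in the SparkContext change does.
        try releasePartialState()
        finally throw new InitFailure("Error while constructing component", e)
    }

  def main(args: Array[String]): Unit =
    try init(fail = true)
    catch { case e: InitFailure => println(s"caught: ${e.getMessage}; cause: ${e.getCause}") }
}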
2 changes: 1 addition & 1 deletion core/src/main/scala/org/apache/spark/SparkEnv.scala
@@ -318,7 +318,7 @@ object SparkEnv extends Logging {
val sparkProperties = (conf.getAll ++ schedulerMode).sorted

// System properties that are not java classpaths
val systemProperties = System.getProperties.iterator.toSeq
val systemProperties = Utils.getSystemProperties.toSeq
val otherProperties = systemProperties.filter { case (k, _) =>
k != "java.class.path" && !k.startsWith("spark.")
}.sorted
17 changes: 10 additions & 7 deletions core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
@@ -210,8 +210,9 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
* Return an RDD of grouped elements. Each group consists of a key and a sequence of elements
* mapping to that key.
*/
def groupBy[K](f: JFunction[T, K]): JavaPairRDD[K, JIterable[T]] = {
implicit val ctagK: ClassTag[K] = fakeClassTag
def groupBy[U](f: JFunction[T, U]): JavaPairRDD[U, JIterable[T]] = {
// The type parameter is U instead of K in order to work around a compiler bug; see SPARK-4459
implicit val ctagK: ClassTag[U] = fakeClassTag
implicit val ctagV: ClassTag[JList[T]] = fakeClassTag
JavaPairRDD.fromRDD(groupByResultToJava(rdd.groupBy(f)(fakeClassTag)))
}
@@ -220,10 +221,11 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
* Return an RDD of grouped elements. Each group consists of a key and a sequence of elements
* mapping to that key.
*/
def groupBy[K](f: JFunction[T, K], numPartitions: Int): JavaPairRDD[K, JIterable[T]] = {
implicit val ctagK: ClassTag[K] = fakeClassTag
def groupBy[U](f: JFunction[T, U], numPartitions: Int): JavaPairRDD[U, JIterable[T]] = {
// The type parameter is U instead of K in order to work around a compiler bug; see SPARK-4459
implicit val ctagK: ClassTag[U] = fakeClassTag
implicit val ctagV: ClassTag[JList[T]] = fakeClassTag
JavaPairRDD.fromRDD(groupByResultToJava(rdd.groupBy(f, numPartitions)(fakeClassTag[K])))
JavaPairRDD.fromRDD(groupByResultToJava(rdd.groupBy(f, numPartitions)(fakeClassTag[U])))
}

/**
@@ -458,8 +460,9 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
/**
* Creates tuples of the elements in this RDD by applying `f`.
*/
def keyBy[K](f: JFunction[T, K]): JavaPairRDD[K, T] = {
implicit val ctag: ClassTag[K] = fakeClassTag
def keyBy[U](f: JFunction[T, U]): JavaPairRDD[U, T] = {
// The type parameter is U instead of K in order to work around a compiler bug; see SPARK-4459
implicit val ctag: ClassTag[U] = fakeClassTag
JavaPairRDD.fromRDD(rdd.keyBy(f))
}

62 changes: 60 additions & 2 deletions core/src/main/scala/org/apache/spark/api/java/JavaUtils.scala
@@ -19,7 +19,8 @@ package org.apache.spark.api.java

import com.google.common.base.Optional

import scala.collection.convert.Wrappers.MapWrapper
import java.{util => ju}
import scala.collection.mutable

private[spark] object JavaUtils {
def optionToOptional[T](option: Option[T]): Optional[T] =
@@ -32,7 +33,64 @@ private[spark] object JavaUtils {
def mapAsSerializableJavaMap[A, B](underlying: collection.Map[A, B]) =
new SerializableMapWrapper(underlying)

// Implementation is copied from scala.collection.convert.Wrappers.MapWrapper,
// but implements java.io.Serializable. It can't just be subclassed to make it
// Serializable since the MapWrapper class has no no-arg constructor. This class
// doesn't need a no-arg constructor though.
class SerializableMapWrapper[A, B](underlying: collection.Map[A, B])
extends MapWrapper(underlying) with java.io.Serializable
extends ju.AbstractMap[A, B] with java.io.Serializable { self =>

override def size = underlying.size

override def get(key: AnyRef): B = try {
underlying get key.asInstanceOf[A] match {
case None => null.asInstanceOf[B]
case Some(v) => v
}
} catch {
case ex: ClassCastException => null.asInstanceOf[B]
}

override def entrySet: ju.Set[ju.Map.Entry[A, B]] = new ju.AbstractSet[ju.Map.Entry[A, B]] {
def size = self.size

def iterator = new ju.Iterator[ju.Map.Entry[A, B]] {
val ui = underlying.iterator
var prev : Option[A] = None

def hasNext = ui.hasNext

def next() = {
val (k, v) = ui.next
prev = Some(k)
new ju.Map.Entry[A, B] {
import scala.util.hashing.byteswap32
def getKey = k
def getValue = v
def setValue(v1 : B) = self.put(k, v1)
override def hashCode = byteswap32(k.hashCode) + (byteswap32(v.hashCode) << 16)
override def equals(other: Any) = other match {
case e: ju.Map.Entry[_, _] => k == e.getKey && v == e.getValue
case _ => false
}
}
}

def remove() {
prev match {
case Some(k) =>
underlying match {
case mm: mutable.Map[a, _] =>
mm remove k
prev = None
case _ =>
throw new UnsupportedOperationException("remove")
}
case _ =>
throw new IllegalStateException("next must be called at least once before remove")
}
}
}
}
}
}
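As a quick sanity check of what this rewrite buys (SPARK-3926), the hedged sketch below serializes the wrapper handed back to Java callers; the old MapWrapper-based implementation failed at this point with NotSerializableException. The demo package name is invented, and it must live under org.apache.spark because JavaUtils is private[spark].

package org.apache.spark.demo

import java.io.{ByteArrayOutputStream, ObjectOutputStream}

import org.apache.spark.api.java.JavaUtils

object SerializableMapWrapperCheck {
  def main(args: Array[String]): Unit = {
    val wrapped = JavaUtils.mapAsSerializableJavaMap(Map("a" -> 1, "b" -> 2))
    val out = new ObjectOutputStream(new ByteArrayOutputStream())
    out.writeObject(wrapped) // threw NotSerializableException before this change
    out.close()
    println(s"serialized ${wrapped.size()} entries")
  }
}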