From b528367d75bde36004b551bb149e024b56a3ba57 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Mon, 17 Nov 2014 11:25:38 -0800 Subject: [PATCH 01/72] Revert "[SPARK-4075] [Deploy] Jar url validation is not enough for Jar file" This reverts commit 098f83c7ccd7dad9f9228596da69fe5f55711a52. --- .../org/apache/spark/deploy/ClientArguments.scala | 11 +---------- .../scala/org/apache/spark/deploy/ClientSuite.scala | 6 ------ 2 files changed, 1 insertion(+), 16 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/ClientArguments.scala b/core/src/main/scala/org/apache/spark/deploy/ClientArguments.scala index 4e802e02c4149..39150deab863c 100644 --- a/core/src/main/scala/org/apache/spark/deploy/ClientArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/ClientArguments.scala @@ -17,8 +17,6 @@ package org.apache.spark.deploy -import java.net.{URI, URISyntaxException} - import scala.collection.mutable.ListBuffer import org.apache.log4j.Level @@ -116,12 +114,5 @@ private[spark] class ClientArguments(args: Array[String]) { } object ClientArguments { - def isValidJarUrl(s: String): Boolean = { - try { - val uri = new URI(s) - uri.getScheme != null && uri.getAuthority != null && s.endsWith("jar") - } catch { - case _: URISyntaxException => false - } - } + def isValidJarUrl(s: String): Boolean = s.matches("(.+):(.+)jar") } diff --git a/core/src/test/scala/org/apache/spark/deploy/ClientSuite.scala b/core/src/test/scala/org/apache/spark/deploy/ClientSuite.scala index 94a2bdd74e744..4161aede1d1d0 100644 --- a/core/src/test/scala/org/apache/spark/deploy/ClientSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/ClientSuite.scala @@ -29,12 +29,6 @@ class ClientSuite extends FunSuite with Matchers { ClientArguments.isValidJarUrl("hdfs://someHost:1234/foo") should be (false) ClientArguments.isValidJarUrl("/missing/a/protocol/jarfile.jar") should be (false) ClientArguments.isValidJarUrl("not-even-a-path.jar") should be (false) - - // No authority - ClientArguments.isValidJarUrl("hdfs:someHost:1234/jarfile.jar") should be (false) - - // Invalid syntax - ClientArguments.isValidJarUrl("hdfs:") should be (false) } } From cf8d0ef72a2fa678c7677cb0861d4653f4026d98 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Mon, 17 Nov 2014 11:49:33 -0800 Subject: [PATCH 02/72] Revert "[maven-release-plugin] prepare for next development iteration" This reverts commit 685bdd2b7e584c84e7d39e40de2d5f30c5388cb5. 
--- assembly/pom.xml | 2 +- bagel/pom.xml | 2 +- core/pom.xml | 2 +- examples/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka/pom.xml | 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml | 2 +- external/zeromq/pom.xml | 2 +- extras/kinesis-asl/pom.xml | 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 4 ++-- repl/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- yarn/pom.xml | 2 +- yarn/stable/pom.xml | 2 +- 24 files changed, 25 insertions(+), 25 deletions(-) diff --git a/assembly/pom.xml b/assembly/pom.xml index 5e931b7457210..e79d29b999f19 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.2-SNAPSHOT + 1.1.1 ../pom.xml diff --git a/bagel/pom.xml b/bagel/pom.xml index 83faf29de577f..f1c2b2171e010 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.2-SNAPSHOT + 1.1.1 ../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 664e52f761d32..96effdb711165 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.2-SNAPSHOT + 1.1.1 ../pom.xml diff --git a/examples/pom.xml b/examples/pom.xml index 4d9da0191face..aa748c6de0f12 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.2-SNAPSHOT + 1.1.1 ../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 692f87b1cb0ec..c443eaa238407 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.2-SNAPSHOT + 1.1.1 ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 5d0f7ffc5390a..6d06a2da640f3 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.2-SNAPSHOT + 1.1.1 ../../pom.xml diff --git a/external/kafka/pom.xml b/external/kafka/pom.xml index b267c4757623c..09602f672516b 100644 --- a/external/kafka/pom.xml +++ b/external/kafka/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.2-SNAPSHOT + 1.1.1 ../../pom.xml diff --git a/external/mqtt/pom.xml b/external/mqtt/pom.xml index c7fbf4b95500e..462079cbf2eb8 100644 --- a/external/mqtt/pom.xml +++ b/external/mqtt/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.2-SNAPSHOT + 1.1.1 ../../pom.xml diff --git a/external/twitter/pom.xml b/external/twitter/pom.xml index 36afc14c64fc5..c1628831bf258 100644 --- a/external/twitter/pom.xml +++ b/external/twitter/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.2-SNAPSHOT + 1.1.1 ../../pom.xml diff --git a/external/zeromq/pom.xml b/external/zeromq/pom.xml index 5ba0fb1a4a7f3..7670646bc6ea5 100644 --- a/external/zeromq/pom.xml +++ b/external/zeromq/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.2-SNAPSHOT + 1.1.1 ../../pom.xml diff --git a/extras/kinesis-asl/pom.xml b/extras/kinesis-asl/pom.xml index a872bf2327fc6..2433818b07be0 100644 --- a/extras/kinesis-asl/pom.xml +++ b/extras/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent - 1.1.2-SNAPSHOT + 1.1.1 ../../pom.xml diff --git a/extras/spark-ganglia-lgpl/pom.xml b/extras/spark-ganglia-lgpl/pom.xml index 430a5e7cbfde0..b6f4875f9d7ca 100644 --- a/extras/spark-ganglia-lgpl/pom.xml +++ b/extras/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ 
org.apache.spark spark-parent - 1.1.2-SNAPSHOT + 1.1.1 ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index b3c843a5f3c4c..cb3d09038e4da 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.2-SNAPSHOT + 1.1.1 ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index f228212091b44..9443f6118b3bb 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.2-SNAPSHOT + 1.1.1 ../pom.xml diff --git a/pom.xml b/pom.xml index d185ccccf3686..7310884780b07 100644 --- a/pom.xml +++ b/pom.xml @@ -25,7 +25,7 @@ org.apache.spark spark-parent - 1.1.2-SNAPSHOT + 1.1.1 pom Spark Project Parent POM http://spark.apache.org/ @@ -40,7 +40,7 @@ scm:git:git@github.com:apache/spark.git scm:git:https://git-wip-us.apache.org/repos/asf/spark.git scm:git:git@github.com:apache/spark.git - HEAD + v1.1.1-rc1 diff --git a/repl/pom.xml b/repl/pom.xml index ef8d40d84285a..b0b9196bc7b86 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.2-SNAPSHOT + 1.1.1 ../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 909dc94230071..19ca3044f987e 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.2-SNAPSHOT + 1.1.1 ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index f6a5f955cdd64..148a521d6e5e0 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.2-SNAPSHOT + 1.1.1 ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index b7540f02ce8ea..c3c2f552f25e1 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.2-SNAPSHOT + 1.1.1 ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 241e305cad986..ff01bc6dc7f8c 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.2-SNAPSHOT + 1.1.1 ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index 09c403f08b16a..9b56e32764a5f 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.2-SNAPSHOT + 1.1.1 ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 09894f4eb07ea..a2bb5af01dcd3 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent - 1.1.2-SNAPSHOT + 1.1.1 ../pom.xml diff --git a/yarn/pom.xml b/yarn/pom.xml index bf298998e559f..2fad21bb08505 100644 --- a/yarn/pom.xml +++ b/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent - 1.1.2-SNAPSHOT + 1.1.1 ../pom.xml diff --git a/yarn/stable/pom.xml b/yarn/stable/pom.xml index c31eb0faf4368..74a30ae8a7af9 100644 --- a/yarn/stable/pom.xml +++ b/yarn/stable/pom.xml @@ -20,7 +20,7 @@ org.apache.spark yarn-parent_2.10 - 1.1.2-SNAPSHOT + 1.1.1 ../pom.xml From e4f5695ec4240a57e38565906851b03f98898175 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Mon, 17 Nov 2014 11:49:48 -0800 Subject: [PATCH 03/72] Revert "[maven-release-plugin] prepare release v1.1.1-rc1" This reverts commit 72a4fdbe82203b962fe776d0edaed7f56898cb02. 
--- assembly/pom.xml | 2 +- bagel/pom.xml | 2 +- core/pom.xml | 2 +- examples/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka/pom.xml | 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml | 2 +- external/zeromq/pom.xml | 2 +- extras/kinesis-asl/pom.xml | 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 4 ++-- repl/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- yarn/pom.xml | 2 +- yarn/stable/pom.xml | 2 +- 24 files changed, 25 insertions(+), 25 deletions(-) diff --git a/assembly/pom.xml b/assembly/pom.xml index e79d29b999f19..0531fb7b37268 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.1-SNAPSHOT ../pom.xml diff --git a/bagel/pom.xml b/bagel/pom.xml index f1c2b2171e010..f29540b239c73 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.1-SNAPSHOT ../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 96effdb711165..0b9486fac522a 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.1-SNAPSHOT ../pom.xml diff --git a/examples/pom.xml b/examples/pom.xml index aa748c6de0f12..6124cf8552c14 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.1-SNAPSHOT ../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index c443eaa238407..17d0fe233873f 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.1-SNAPSHOT ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 6d06a2da640f3..402af353152d8 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.1-SNAPSHOT ../../pom.xml diff --git a/external/kafka/pom.xml b/external/kafka/pom.xml index 09602f672516b..5123d0554639c 100644 --- a/external/kafka/pom.xml +++ b/external/kafka/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.1-SNAPSHOT ../../pom.xml diff --git a/external/mqtt/pom.xml b/external/mqtt/pom.xml index 462079cbf2eb8..9c00bfc8429a4 100644 --- a/external/mqtt/pom.xml +++ b/external/mqtt/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.1-SNAPSHOT ../../pom.xml diff --git a/external/twitter/pom.xml b/external/twitter/pom.xml index c1628831bf258..1b9ef4af0c2ed 100644 --- a/external/twitter/pom.xml +++ b/external/twitter/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.1-SNAPSHOT ../../pom.xml diff --git a/external/zeromq/pom.xml b/external/zeromq/pom.xml index 7670646bc6ea5..60292a2683212 100644 --- a/external/zeromq/pom.xml +++ b/external/zeromq/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.1-SNAPSHOT ../../pom.xml diff --git a/extras/kinesis-asl/pom.xml b/extras/kinesis-asl/pom.xml index 2433818b07be0..58b995c5e7005 100644 --- a/extras/kinesis-asl/pom.xml +++ b/extras/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.1-SNAPSHOT ../../pom.xml diff --git a/extras/spark-ganglia-lgpl/pom.xml b/extras/spark-ganglia-lgpl/pom.xml index b6f4875f9d7ca..02c9676fb086a 100644 --- a/extras/spark-ganglia-lgpl/pom.xml +++ b/extras/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ 
org.apache.spark spark-parent - 1.1.1 + 1.1.1-SNAPSHOT ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index cb3d09038e4da..656478583fac2 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.1-SNAPSHOT ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index 9443f6118b3bb..74f528f030987 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.1-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index 7310884780b07..97df626fbab40 100644 --- a/pom.xml +++ b/pom.xml @@ -25,7 +25,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.1-SNAPSHOT pom Spark Project Parent POM http://spark.apache.org/ @@ -40,7 +40,7 @@ scm:git:git@github.com:apache/spark.git scm:git:https://git-wip-us.apache.org/repos/asf/spark.git scm:git:git@github.com:apache/spark.git - v1.1.1-rc1 + HEAD diff --git a/repl/pom.xml b/repl/pom.xml index b0b9196bc7b86..8748ada36f57a 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.1-SNAPSHOT ../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 19ca3044f987e..e2356381c07fb 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.1-SNAPSHOT ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 148a521d6e5e0..3efea9ab8b247 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.1-SNAPSHOT ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index c3c2f552f25e1..c264ff4ec92e5 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.1-SNAPSHOT ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index ff01bc6dc7f8c..1e689e6d6dcf2 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.1-SNAPSHOT ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index 9b56e32764a5f..c0ce0d7c7478d 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.1-SNAPSHOT ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index a2bb5af01dcd3..c601fd5fbbee2 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.1-SNAPSHOT ../pom.xml diff --git a/yarn/pom.xml b/yarn/pom.xml index 2fad21bb08505..18f27b827ff1a 100644 --- a/yarn/pom.xml +++ b/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.1-SNAPSHOT ../pom.xml diff --git a/yarn/stable/pom.xml b/yarn/stable/pom.xml index 74a30ae8a7af9..2ba3baf0e3b2e 100644 --- a/yarn/stable/pom.xml +++ b/yarn/stable/pom.xml @@ -20,7 +20,7 @@ org.apache.spark yarn-parent_2.10 - 1.1.1 + 1.1.1-SNAPSHOT ../pom.xml From aa9ebdaa28bebccc8f65a323d7c6fc34cf68ef73 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Mon, 17 Nov 2014 18:10:49 -0800 Subject: [PATCH 04/72] [SPARK-4467] Partial fix for fetch failure in sort-based shuffle (1.1) This is the 1.1 version of #3302. There has been some refactoring in master so we can't cherry-pick that PR. 
Author: Andrew Or Closes #3330 from andrewor14/sort-fetch-fail and squashes the following commits: 486fc49 [Andrew Or] Reset `elementsRead` --- .../scala/org/apache/spark/util/collection/ExternalSorter.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala index 3136306a3adb0..97ddd96c98268 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala @@ -288,6 +288,7 @@ private[spark] class ExternalSorter[K, V, C]( myMemoryThreshold = 0 _memoryBytesSpilled += memorySize + elementsRead = 0 } /** From 91b5fa82477e5fd43712fdf067d92a31d4037a83 Mon Sep 17 00:00:00 2001 From: Kousuke Saruta Date: Tue, 18 Nov 2014 12:09:18 -0800 Subject: [PATCH 05/72] [SPARK-4393] Fix memory leak in ConnectionManager ACK timeout TimerTasks; use HashedWheelTimer (For branch-1.1) This patch is intended to fix a subtle memory leak in ConnectionManager's ACK timeout TimerTasks: in the old code, each TimerTask held a reference to the message being sent and a cancelled TimerTask won't necessarily be garbage-collected until it's scheduled to run, so this caused huge buildups of messages that weren't garbage collected until their timeouts expired, leading to OOMs. This patch addresses this problem by capturing only the message ID in the TimerTask instead of the whole message, and by keeping a WeakReference to the promise in the TimerTask. I've also modified this code to use Netty's HashedWheelTimer, whose performance characteristics should be better for this use-case. Author: Kousuke Saruta Closes #3321 from sarutak/connection-manager-timeout-bugfix and squashes the following commits: 786af91 [Kousuke Saruta] Fixed memory leak issue of ConnectionManager --- .../spark/network/ConnectionManager.scala | 52 ++++++++++++++----- 1 file changed, 39 insertions(+), 13 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/network/ConnectionManager.scala b/core/src/main/scala/org/apache/spark/network/ConnectionManager.scala index 578d806263006..6d58129babc88 100644 --- a/core/src/main/scala/org/apache/spark/network/ConnectionManager.scala +++ b/core/src/main/scala/org/apache/spark/network/ConnectionManager.scala @@ -18,11 +18,11 @@ package org.apache.spark.network import java.io.IOException +import java.lang.ref.WeakReference import java.nio._ import java.nio.channels._ import java.nio.channels.spi._ import java.net._ -import java.util.{Timer, TimerTask} import java.util.concurrent.atomic.AtomicInteger import java.util.concurrent.{LinkedBlockingDeque, TimeUnit, ThreadPoolExecutor} @@ -37,6 +37,8 @@ import scala.concurrent.{Await, ExecutionContext, Future, Promise} import scala.concurrent.duration._ import scala.language.postfixOps +import io.netty.util.{Timeout, TimerTask, HashedWheelTimer} + import org.apache.spark._ import org.apache.spark.util.{SystemClock, Utils} @@ -68,7 +70,8 @@ private[spark] class ConnectionManager( } private val selector = SelectorProvider.provider.openSelector() - private val ackTimeoutMonitor = new Timer("AckTimeoutMonitor", true) + private val ackTimeoutMonitor = + new HashedWheelTimer(Utils.namedThreadFactory("AckTimeoutMonitor")) // default to 30 second timeout waiting for authentication private val authTimeout = conf.getInt("spark.core.connection.auth.wait.timeout", 30) @@ -105,7 +108,10 @@ private[spark] class ConnectionManager( new 
HashMap[SelectionKey, Connection] with SynchronizedMap[SelectionKey, Connection] private val connectionsById = new HashMap[ConnectionManagerId, SendingConnection] with SynchronizedMap[ConnectionManagerId, SendingConnection] - private val messageStatuses = new HashMap[Int, MessageStatus] + // Tracks sent messages for which we are awaiting acknowledgements. Entries are added to this + // map when messages are sent and are removed when acknowledgement messages are received or when + // acknowledgement timeouts expire + private val messageStatuses = new HashMap[Int, MessageStatus] // [MessageId, MessageStatus] private val keyInterestChangeRequests = new SynchronizedQueue[(SelectionKey, Int)] private val registerRequests = new SynchronizedQueue[SendingConnection] @@ -846,20 +852,41 @@ private[spark] class ConnectionManager( : Future[Message] = { val promise = Promise[Message]() - val timeoutTask = new TimerTask { - override def run(): Unit = { + // It's important that the TimerTask doesn't capture a reference to `message`, which can cause + // memory leaks since cancelled TimerTasks won't necessarily be garbage collected until the time + // at which they would originally be scheduled to run. Therefore, extract the message id + // from outside of the TimerTask closure (see SPARK-4393 for more context). + val messageId = message.id + // Keep a weak reference to the promise so that the completed promise may be garbage-collected + val promiseReference = new WeakReference(promise) + val timeoutTask: TimerTask = new TimerTask { + override def run(timeout: Timeout): Unit = { messageStatuses.synchronized { - messageStatuses.remove(message.id).foreach ( s => { - promise.failure( - new IOException("sendMessageReliably failed because ack " + - s"was not received within $ackTimeout sec")) - }) + messageStatuses.remove(messageId).foreach { s => + val e = new IOException("sendMessageReliably failed because ack " + + s"was not received within $ackTimeout sec") + val p = promiseReference.get + if (p != null) { + // Attempt to fail the promise with a Timeout exception + if (!p.tryFailure(e)) { + // If we reach here, then someone else has already signalled success or failure + // on this promise, so log a warning: + logError("Ignore error because promise is completed", e) + } + } else { + // The WeakReference was empty, which should never happen because + // sendMessageReliably's caller should have a strong reference to promise.future; + logError("Promise was garbage collected; this should never happen!", e) + } + } } } } + val timeoutTaskHandle = ackTimeoutMonitor.newTimeout(timeoutTask, ackTimeout, TimeUnit.SECONDS) + val status = new MessageStatus(message, connectionManagerId, s => { - timeoutTask.cancel() + timeoutTaskHandle.cancel() s.ackMessage match { case None => // Indicates a failure where we either never sent or never got ACK'd promise.failure(new IOException("sendMessageReliably failed without being ACK'd")) @@ -876,7 +903,6 @@ private[spark] class ConnectionManager( messageStatuses += ((message.id, status)) } - ackTimeoutMonitor.schedule(timeoutTask, ackTimeout * 1000) sendMessage(connectionManagerId, message) promise.future } @@ -886,7 +912,7 @@ private[spark] class ConnectionManager( } def stop() { - ackTimeoutMonitor.cancel() + ackTimeoutMonitor.stop() selectorThread.interrupt() selectorThread.join() selector.close() From ae9b1f69061401cf47d5a2e3dec79b18a7ef6bad Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Tue, 18 Nov 2014 16:25:44 -0800 Subject: [PATCH 06/72] [SPARK-4433] fix a racing 
condition in zipWithIndex Spark hangs with the following code: ~~~ sc.parallelize(1 to 10).zipWithIndex.repartition(10).count() ~~~ This is because ZippedWithIndexRDD triggers a job in getPartitions and it causes a deadlock in DAGScheduler.getPreferredLocs (synced). The fix is to compute `startIndices` during construction. This should be applied to branch-1.0, branch-1.1, and branch-1.2. pwendell Author: Xiangrui Meng Closes #3291 from mengxr/SPARK-4433 and squashes the following commits: c284d9f [Xiangrui Meng] fix a racing condition in zipWithIndex (cherry picked from commit bb46046154a438df4db30a0e1fd557bd3399ee7b) Signed-off-by: Xiangrui Meng --- .../apache/spark/rdd/ZippedWithIndexRDD.scala | 31 ++++++++++--------- .../scala/org/apache/spark/rdd/RDDSuite.scala | 5 +++ 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala index e2c301603b4a5..8c43a559409f2 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ZippedWithIndexRDD.scala @@ -39,21 +39,24 @@ class ZippedWithIndexRDDPartition(val prev: Partition, val startIndex: Long) private[spark] class ZippedWithIndexRDD[T: ClassTag](@transient prev: RDD[T]) extends RDD[(T, Long)](prev) { - override def getPartitions: Array[Partition] = { + /** The start index of each partition. */ + @transient private val startIndices: Array[Long] = { val n = prev.partitions.size - val startIndices: Array[Long] = - if (n == 0) { - Array[Long]() - } else if (n == 1) { - Array(0L) - } else { - prev.context.runJob( - prev, - Utils.getIteratorSize _, - 0 until n - 1, // do not need to count the last partition - false - ).scanLeft(0L)(_ + _) - } + if (n == 0) { + Array[Long]() + } else if (n == 1) { + Array(0L) + } else { + prev.context.runJob( + prev, + Utils.getIteratorSize _, + 0 until n - 1, // do not need to count the last partition + allowLocal = false + ).scanLeft(0L)(_ + _) + } + } + + override def getPartitions: Array[Partition] = { firstParent[T].partitions.map(x => new ZippedWithIndexRDDPartition(x, startIndices(x.index))) } diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala index cb0bfb43ecafe..96b11654a1bd7 100644 --- a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala @@ -719,6 +719,11 @@ class RDDSuite extends FunSuite with SharedSparkContext { } } + test("zipWithIndex chained with other RDDs (SPARK-4433)") { + val count = sc.parallelize(0 until 10, 2).zipWithIndex().repartition(4).count() + assert(count === 10) + } + test("zipWithUniqueId") { val n = 10 val data = sc.parallelize(0 until n, 3) From f9739b9c886b1c207753ebf7067c09a60eff1695 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Tue, 18 Nov 2014 17:40:24 -0800 Subject: [PATCH 07/72] [SPARK-4468][SQL] Backports #3334 to branch-1.1 [Review on Reviewable](https://reviewable.io/reviews/apache/spark/3338) Author: Cheng Lian Closes #3338 from liancheng/spark-3334-for-1.1 and squashes the following commits: bd17512 [Cheng Lian] Backports #3334 to branch-1.1 --- .../spark/sql/parquet/ParquetFilters.scala | 13 ++- .../spark/sql/parquet/ParquetQuerySuite.scala | 107 +++++++++++------- 2 files changed, 75 insertions(+), 45 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala index 7c83f1cad7d71..0365c34c80241 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetFilters.scala @@ -213,22 +213,27 @@ private[sql] object ParquetFilters { Some(createEqualityFilter(right.name, left, p)) case p @ EqualTo(left: NamedExpression, right: Literal) if !left.nullable => Some(createEqualityFilter(left.name, right, p)) + case p @ LessThan(left: Literal, right: NamedExpression) if !right.nullable => - Some(createLessThanFilter(right.name, left, p)) + Some(createGreaterThanFilter(right.name, left, p)) case p @ LessThan(left: NamedExpression, right: Literal) if !left.nullable => Some(createLessThanFilter(left.name, right, p)) + case p @ LessThanOrEqual(left: Literal, right: NamedExpression) if !right.nullable => - Some(createLessThanOrEqualFilter(right.name, left, p)) + Some(createGreaterThanOrEqualFilter(right.name, left, p)) case p @ LessThanOrEqual(left: NamedExpression, right: Literal) if !left.nullable => Some(createLessThanOrEqualFilter(left.name, right, p)) + case p @ GreaterThan(left: Literal, right: NamedExpression) if !right.nullable => - Some(createGreaterThanFilter(right.name, left, p)) + Some(createLessThanFilter(right.name, left, p)) case p @ GreaterThan(left: NamedExpression, right: Literal) if !left.nullable => Some(createGreaterThanFilter(left.name, right, p)) + case p @ GreaterThanOrEqual(left: Literal, right: NamedExpression) if !right.nullable => - Some(createGreaterThanOrEqualFilter(right.name, left, p)) + Some(createLessThanOrEqualFilter(right.name, left, p)) case p @ GreaterThanOrEqual(left: NamedExpression, right: Literal) if !left.nullable => Some(createGreaterThanOrEqualFilter(left.name, right, p)) + case _ => None } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala index c6b790a4b6a23..10df1fac210a6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala @@ -17,20 +17,19 @@ package org.apache.spark.sql.parquet +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.hadoop.mapreduce.Job import org.scalatest.{BeforeAndAfterAll, FunSuiteLike} - import parquet.hadoop.ParquetFileWriter import parquet.hadoop.util.ContextUtil -import org.apache.hadoop.fs.{FileSystem, Path} -import org.apache.hadoop.mapreduce.Job import org.apache.spark.SparkContext import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.{SqlLexical, SqlParser} import org.apache.spark.sql.catalyst.analysis.{Star, UnresolvedAttribute} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.types.{BooleanType, IntegerType} import org.apache.spark.sql.catalyst.util.getTempFilePath +import org.apache.spark.sql.catalyst.{SqlLexical, SqlParser} import org.apache.spark.sql.test.TestSQLContext import org.apache.spark.sql.test.TestSQLContext._ import org.apache.spark.util.Utils @@ -453,43 +452,46 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterA } test("create RecordFilter for simple predicates") { - val attribute1 = new AttributeReference("first", IntegerType, false)() - val predicate1 = new EqualTo(attribute1, new Literal(1, IntegerType)) - val filter1 = ParquetFilters.createFilter(predicate1) - 
assert(filter1.isDefined) - assert(filter1.get.predicate == predicate1, "predicates do not match") - assert(filter1.get.isInstanceOf[ComparisonFilter]) - val cmpFilter1 = filter1.get.asInstanceOf[ComparisonFilter] - assert(cmpFilter1.columnName == "first", "column name incorrect") - - val predicate2 = new LessThan(attribute1, new Literal(4, IntegerType)) - val filter2 = ParquetFilters.createFilter(predicate2) - assert(filter2.isDefined) - assert(filter2.get.predicate == predicate2, "predicates do not match") - assert(filter2.get.isInstanceOf[ComparisonFilter]) - val cmpFilter2 = filter2.get.asInstanceOf[ComparisonFilter] - assert(cmpFilter2.columnName == "first", "column name incorrect") - - val predicate3 = new And(predicate1, predicate2) - val filter3 = ParquetFilters.createFilter(predicate3) - assert(filter3.isDefined) - assert(filter3.get.predicate == predicate3, "predicates do not match") - assert(filter3.get.isInstanceOf[AndFilter]) - - val predicate4 = new Or(predicate1, predicate2) - val filter4 = ParquetFilters.createFilter(predicate4) - assert(filter4.isDefined) - assert(filter4.get.predicate == predicate4, "predicates do not match") - assert(filter4.get.isInstanceOf[OrFilter]) - - val attribute2 = new AttributeReference("second", IntegerType, false)() - val predicate5 = new GreaterThan(attribute1, attribute2) - val badfilter = ParquetFilters.createFilter(predicate5) - assert(badfilter.isDefined === false) - - val predicate6 = And(GreaterThan(attribute1, attribute2), GreaterThan(attribute1, attribute2)) - val badfilter2 = ParquetFilters.createFilter(predicate6) - assert(badfilter2.isDefined === false) + def checkFilter(predicate: Predicate): Option[CatalystFilter] = { + ParquetFilters.createFilter(predicate).map { f => + assertResult(predicate)(f.predicate) + f + }.orElse { + fail(s"filter $predicate not pushed down") + } + } + + def checkComparisonFilter(predicate: Predicate, columnName: String): Unit = { + assertResult(columnName, "column name incorrect") { + checkFilter(predicate).map(_.asInstanceOf[ComparisonFilter].columnName).get + } + } + + def checkInvalidFilter(predicate: Predicate): Unit = { + assert(ParquetFilters.createFilter(predicate).isEmpty) + } + + val a = 'a.int.notNull + val b = 'b.int.notNull + + checkComparisonFilter(a === 1, "a") + checkComparisonFilter(Literal(1) === a, "a") + + checkComparisonFilter(a < 4, "a") + checkComparisonFilter(a > 4, "a") + checkComparisonFilter(a <= 4, "a") + checkComparisonFilter(a >= 4, "a") + + checkComparisonFilter(Literal(4) > a, "a") + checkComparisonFilter(Literal(4) < a, "a") + checkComparisonFilter(Literal(4) >= a, "a") + checkComparisonFilter(Literal(4) <= a, "a") + + checkFilter(a === 1 && a < 4) + checkFilter(a === 1 || a < 4) + + checkInvalidFilter(a > b) + checkInvalidFilter((a > b) && (a > b)) } test("test filter by predicate pushdown") { @@ -516,6 +518,29 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterA assert(result2(49)(1) === 199) } } + for(myval <- Seq("myint", "mylong", "mydouble", "myfloat")) { + val query1 = sql(s"SELECT * FROM testfiltersource WHERE 150 > $myval AND 100 <= $myval") + assert( + query1.queryExecution.executedPlan(0)(0).isInstanceOf[ParquetTableScan], + "Top operator should be ParquetTableScan after pushdown") + val result1 = query1.collect() + assert(result1.size === 50) + assert(result1(0)(1) === 100) + assert(result1(49)(1) === 149) + val query2 = sql(s"SELECT * FROM testfiltersource WHERE 150 < $myval AND 200 >= $myval") + assert( + 
query2.queryExecution.executedPlan(0)(0).isInstanceOf[ParquetTableScan], + "Top operator should be ParquetTableScan after pushdown") + val result2 = query2.collect() + assert(result2.size === 50) + if (myval == "myint" || myval == "mylong") { + assert(result2(0)(1) === 151) + assert(result2(49)(1) === 200) + } else { + assert(result2(0)(1) === 150) + assert(result2(49)(1) === 199) + } + } for(myval <- Seq("myint", "mylong")) { val query3 = sql(s"SELECT * FROM testfiltersource WHERE $myval > 190 OR $myval < 10") assert( From e22a75923e508e17b924f341d02cd5cd679210ca Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Tue, 18 Nov 2014 20:15:00 -0800 Subject: [PATCH 08/72] [SPARK-4380] Log more precise number of bytes spilled (1.1) This is the branch-1.1 version of #3243. Author: Andrew Or Closes #3355 from andrewor14/spill-log-bytes-1.1 and squashes the following commits: 36ec152 [Andrew Or] Log more precise representation of bytes in spilling code --- .../apache/spark/util/collection/ExternalAppendOnlyMap.scala | 5 +++-- .../org/apache/spark/util/collection/ExternalSorter.scala | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala index 3e09c2599085c..96697d2e603a2 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala @@ -174,8 +174,9 @@ class ExternalAppendOnlyMap[K, V, C]( private def spill(mapSize: Long): Unit = { spillCount += 1 val threadId = Thread.currentThread().getId - logInfo("Thread %d spilling in-memory map of %d MB to disk (%d time%s so far)" - .format(threadId, mapSize / (1024 * 1024), spillCount, if (spillCount > 1) "s" else "")) + logInfo("Thread %d spilling in-memory batch of %s to disk (%d times%s so far)" + .format(threadId, org.apache.spark.util.Utils.bytesToString(mapSize), + spillCount, if (spillCount > 1) "s" else "")) val (blockId, file) = diskBlockManager.createTempLocalBlock() curWriteMetrics = new ShuffleWriteMetrics() var writer = blockManager.getDiskWriter(blockId, file, serializer, fileBufferSize, diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala index 97ddd96c98268..d414ce39e9ae0 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala @@ -268,8 +268,9 @@ private[spark] class ExternalSorter[K, V, C]( spillCount += 1 val threadId = Thread.currentThread().getId - logInfo("Thread %d spilling in-memory batch of %d MB to disk (%d spill%s so far)" - .format(threadId, memorySize / (1024 * 1024), spillCount, if (spillCount > 1) "s" else "")) + logInfo("Thread %d spilling in-memory batch of %s to disk (%d spill%s so far)" + .format(threadId, org.apache.spark.util.Utils.bytesToString(memorySize), + spillCount, if (spillCount > 1) "s" else "")) if (bypassMergeSort) { spillToPartitionFiles(collection) From 16bf5f3d17624db2a96c921fe8a1e153cdafb06c Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Wed, 19 Nov 2014 10:45:42 -0800 Subject: [PATCH 09/72] [SPARK-4480] Avoid many small spills in external data structures (1.1) This is the branch-1.1 version of #3353. 
This requires a separate PR because the code in master has been refactored a little to eliminate duplicate code. I have tested this on a standalone cluster. The goal is to merge this into 1.1.1. Author: Andrew Or Closes #3354 from andrewor14/avoid-small-spills-1.1 and squashes the following commits: f2e552c [Andrew Or] Fix tests 7012595 [Andrew Or] Avoid many small spills --- .../spark/shuffle/ShuffleMemoryManager.scala | 7 +++++-- .../collection/ExternalAppendOnlyMap.scala | 20 ++++++++++++++----- .../util/collection/ExternalSorter.scala | 18 +++++++++++++---- .../util/collection/ExternalSorterSuite.scala | 4 +++- 4 files changed, 37 insertions(+), 12 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala b/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala index ee91a368b76ea..c746e138b63c9 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala @@ -19,7 +19,7 @@ package org.apache.spark.shuffle import scala.collection.mutable -import org.apache.spark.{Logging, SparkException, SparkConf} +import org.apache.spark.{Logging, SparkConf, SparkEnv, SparkException} /** * Allocates a pool of memory to task threads for use in shuffle operations. Each disk-spilling @@ -111,7 +111,7 @@ private[spark] class ShuffleMemoryManager(maxMemory: Long) extends Logging { } } -private object ShuffleMemoryManager { +private[spark] object ShuffleMemoryManager { /** * Figure out the shuffle memory limit from a SparkConf. We currently have both a fraction * of the memory pool and a safety factor since collections can sometimes grow bigger than @@ -122,4 +122,7 @@ private object ShuffleMemoryManager { val safetyFraction = conf.getDouble("spark.shuffle.safetyFraction", 0.8) (Runtime.getRuntime.maxMemory * memoryFraction * safetyFraction).toLong } + + // Initial threshold for the size of a collection before we start tracking its memory usage + val DEFAULT_INITIAL_MEMORY_THRESHOLD: Long = 5 * 1024 * 1024 } diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala index 96697d2e603a2..5619b30d0d1e7 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala @@ -28,10 +28,11 @@ import com.google.common.io.ByteStreams import org.apache.spark.{Logging, SparkEnv} import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.executor.ShuffleWriteMetrics import org.apache.spark.serializer.{DeserializationStream, Serializer} +import org.apache.spark.shuffle.ShuffleMemoryManager import org.apache.spark.storage.{BlockId, BlockManager} import org.apache.spark.util.collection.ExternalAppendOnlyMap.HashComparator -import org.apache.spark.executor.ShuffleWriteMetrics /** * :: DeveloperApi :: @@ -81,8 +82,14 @@ class ExternalAppendOnlyMap[K, V, C]( // Number of in-memory pairs inserted before tracking the map's shuffle memory usage private val trackMemoryThreshold = 1000 - // How much of the shared memory pool this collection has claimed - private var myMemoryThreshold = 0L + // Initial threshold for the size of a collection before we start tracking its memory usage + private val initialMemoryThreshold = + SparkEnv.get.conf.getLong("spark.shuffle.spill.initialMemoryThreshold", + 
ShuffleMemoryManager.DEFAULT_INITIAL_MEMORY_THRESHOLD) + + // Threshold for the collection's size in bytes before we start tracking its memory usage + // To avoid a large number of small spills, initialize this to a value orders of magnitude > 0 + private var myMemoryThreshold = initialMemoryThreshold /** * Size of object batches when reading/writing from serializers. @@ -236,8 +243,11 @@ class ExternalAppendOnlyMap[K, V, C]( spilledMaps.append(new DiskMapIterator(file, blockId, batchSizes)) // Release our memory back to the shuffle pool so that other threads can grab it - shuffleMemoryManager.release(myMemoryThreshold) - myMemoryThreshold = 0L + // The amount we requested does not include the initial memory tracking threshold + shuffleMemoryManager.release(myMemoryThreshold - initialMemoryThreshold) + + // Reset this to the initial threshold to avoid spilling many small files + myMemoryThreshold = initialMemoryThreshold elementsRead = 0 _memoryBytesSpilled += mapSize diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala index d414ce39e9ae0..a049746bd81c0 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala @@ -28,6 +28,7 @@ import com.google.common.io.ByteStreams import org.apache.spark._ import org.apache.spark.serializer.{DeserializationStream, Serializer} import org.apache.spark.executor.ShuffleWriteMetrics +import org.apache.spark.shuffle.ShuffleMemoryManager import org.apache.spark.storage.{BlockObjectWriter, BlockId} /** @@ -134,8 +135,14 @@ private[spark] class ExternalSorter[K, V, C]( // Write metrics for current spill private var curWriteMetrics: ShuffleWriteMetrics = _ - // How much of the shared memory pool this collection has claimed - private var myMemoryThreshold = 0L + // Initial threshold for the size of a collection before we start tracking its memory usage + private val initialMemoryThreshold = + SparkEnv.get.conf.getLong("spark.shuffle.spill.initialMemoryThreshold", + ShuffleMemoryManager.DEFAULT_INITIAL_MEMORY_THRESHOLD) + + // Threshold for the collection's size in bytes before we start tracking its memory usage + // To avoid a large number of small spills, initialize this to a value orders of magnitude > 0 + private var myMemoryThreshold = initialMemoryThreshold // If there are fewer than spark.shuffle.sort.bypassMergeThreshold partitions and we don't need // local aggregation and sorting, write numPartitions files directly and just concatenate them @@ -285,8 +292,11 @@ private[spark] class ExternalSorter[K, V, C]( } // Release our memory back to the shuffle pool so that other threads can grab it - shuffleMemoryManager.release(myMemoryThreshold) - myMemoryThreshold = 0 + // The amount we requested does not include the initial memory tracking threshold + shuffleMemoryManager.release(myMemoryThreshold - initialMemoryThreshold) + + // Reset this to the initial threshold to avoid spilling many small files + myMemoryThreshold = initialMemoryThreshold _memoryBytesSpilled += memorySize elementsRead = 0 diff --git a/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala index f26e40fbd4b36..f4db3ff431b8f 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala +++ 
b/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala @@ -127,6 +127,7 @@ class ExternalSorterSuite extends FunSuite with LocalSparkContext with PrivateMe test("empty partitions with spilling") { val conf = createSparkConf(false) conf.set("spark.shuffle.memoryFraction", "0.001") + conf.set("spark.shuffle.spill.initialMemoryThreshold", "512") conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") sc = new SparkContext("local", "test", conf) @@ -152,6 +153,7 @@ class ExternalSorterSuite extends FunSuite with LocalSparkContext with PrivateMe test("empty partitions with spilling, bypass merge-sort") { val conf = createSparkConf(false) conf.set("spark.shuffle.memoryFraction", "0.001") + conf.set("spark.shuffle.spill.initialMemoryThreshold", "512") conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") sc = new SparkContext("local", "test", conf) @@ -761,5 +763,5 @@ class ExternalSorterSuite extends FunSuite with LocalSparkContext with PrivateMe } sorter2.stop() - } + } } From aa3c794f8a0e2441da72346fdec8f1d499a6f841 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Wed, 19 Nov 2014 19:35:43 +0000 Subject: [PATCH 10/72] Update CHANGES.txt for 1.1.1-rc2 --- CHANGES.txt | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/CHANGES.txt b/CHANGES.txt index dc643ed184d9b..7292d91b80824 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -3,6 +3,71 @@ Spark Change Log Release 1.1.1 + [SPARK-4480] Avoid many small spills in external data structures (1.1) + Andrew Or + 2014-11-19 10:45:42 -0800 + Commit: 16bf5f3, github.com/apache/spark/pull/3354 + + [SPARK-4380] Log more precise number of bytes spilled (1.1) + Andrew Or + 2014-11-18 20:15:00 -0800 + Commit: e22a759, github.com/apache/spark/pull/3355 + + [SPARK-4468][SQL] Backports #3334 to branch-1.1 + Cheng Lian + 2014-11-18 17:40:24 -0800 + Commit: f9739b9, github.com/apache/spark/pull/3338 + + [SPARK-4433] fix a racing condition in zipWithIndex + Xiangrui Meng + 2014-11-18 16:25:44 -0800 + Commit: ae9b1f6, github.com/apache/spark/pull/3291 + + [SPARK-4393] Fix memory leak in ConnectionManager ACK timeout TimerTasks; use HashedWheelTimer (For branch-1.1) + Kousuke Saruta + 2014-11-18 12:09:18 -0800 + Commit: 91b5fa8, github.com/apache/spark/pull/3321 + + [SPARK-4467] Partial fix for fetch failure in sort-based shuffle (1.1) + Andrew Or + 2014-11-17 18:10:49 -0800 + Commit: aa9ebda, github.com/apache/spark/pull/3330 + + Revert "[SPARK-4075] [Deploy] Jar url validation is not enough for Jar file" + Andrew Or + 2014-11-17 11:25:38 -0800 + Commit: b528367 + + [branch-1.1][SPARK-4355] OnlineSummarizer doesn't merge mean correctly + Xiangrui Meng + 2014-11-13 15:36:03 -0800 + Commit: 4b1c77c, github.com/apache/spark/pull/3251 + + [Release] Correct make-distribution.sh log path + Andrew Or + 2014-11-12 13:46:26 -0800 + Commit: ba6d81d + + [Release] Bring audit scripts up-to-date + Andrew Or + 2014-11-13 00:30:58 +0000 + Commit: 88bc482 + + [Release] Log build output for each distribution + Andrew Or + 2014-11-11 18:02:59 -0800 + Commit: e3a5ee9 + + Revert "SPARK-3039: Allow spark to be built using avro-mapred for hadoop2" + Andrew Or + 2014-11-12 00:04:30 -0800 + Commit: 45a01b6 + + Update CHANGES.txt + Andrew Or + 2014-11-11 23:11:32 +0000 + Commit: 131c626 + [SPARK-4295][External]Fix exception in SparkSinkSuite maji2014 2014-11-11 02:18:27 -0800 From 3693ae5d3c01861557e06edbc32a8112683f3d86 Mon Sep 17 00:00:00 2001 From: 
Andrew Or Date: Wed, 19 Nov 2014 20:10:56 +0000 Subject: [PATCH 11/72] [maven-release-plugin] prepare release v1.1.1-rc2 --- assembly/pom.xml | 2 +- bagel/pom.xml | 2 +- core/pom.xml | 2 +- examples/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka/pom.xml | 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml | 2 +- external/zeromq/pom.xml | 2 +- extras/kinesis-asl/pom.xml | 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 4 ++-- repl/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- yarn/pom.xml | 2 +- yarn/stable/pom.xml | 2 +- 24 files changed, 25 insertions(+), 25 deletions(-) diff --git a/assembly/pom.xml b/assembly/pom.xml index 0531fb7b37268..e79d29b999f19 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1-SNAPSHOT + 1.1.1 ../pom.xml diff --git a/bagel/pom.xml b/bagel/pom.xml index f29540b239c73..f1c2b2171e010 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1-SNAPSHOT + 1.1.1 ../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 0b9486fac522a..96effdb711165 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1-SNAPSHOT + 1.1.1 ../pom.xml diff --git a/examples/pom.xml b/examples/pom.xml index 6124cf8552c14..aa748c6de0f12 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1-SNAPSHOT + 1.1.1 ../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 17d0fe233873f..c443eaa238407 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1-SNAPSHOT + 1.1.1 ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 402af353152d8..6d06a2da640f3 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1-SNAPSHOT + 1.1.1 ../../pom.xml diff --git a/external/kafka/pom.xml b/external/kafka/pom.xml index 5123d0554639c..09602f672516b 100644 --- a/external/kafka/pom.xml +++ b/external/kafka/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1-SNAPSHOT + 1.1.1 ../../pom.xml diff --git a/external/mqtt/pom.xml b/external/mqtt/pom.xml index 9c00bfc8429a4..462079cbf2eb8 100644 --- a/external/mqtt/pom.xml +++ b/external/mqtt/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1-SNAPSHOT + 1.1.1 ../../pom.xml diff --git a/external/twitter/pom.xml b/external/twitter/pom.xml index 1b9ef4af0c2ed..c1628831bf258 100644 --- a/external/twitter/pom.xml +++ b/external/twitter/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1-SNAPSHOT + 1.1.1 ../../pom.xml diff --git a/external/zeromq/pom.xml b/external/zeromq/pom.xml index 60292a2683212..7670646bc6ea5 100644 --- a/external/zeromq/pom.xml +++ b/external/zeromq/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1-SNAPSHOT + 1.1.1 ../../pom.xml diff --git a/extras/kinesis-asl/pom.xml b/extras/kinesis-asl/pom.xml index 58b995c5e7005..2433818b07be0 100644 --- a/extras/kinesis-asl/pom.xml +++ b/extras/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent - 1.1.1-SNAPSHOT + 1.1.1 ../../pom.xml diff --git a/extras/spark-ganglia-lgpl/pom.xml b/extras/spark-ganglia-lgpl/pom.xml index 
02c9676fb086a..b6f4875f9d7ca 100644 --- a/extras/spark-ganglia-lgpl/pom.xml +++ b/extras/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent - 1.1.1-SNAPSHOT + 1.1.1 ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index 656478583fac2..cb3d09038e4da 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1-SNAPSHOT + 1.1.1 ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index 74f528f030987..9443f6118b3bb 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1-SNAPSHOT + 1.1.1 ../pom.xml diff --git a/pom.xml b/pom.xml index 97df626fbab40..0848bd11cde7c 100644 --- a/pom.xml +++ b/pom.xml @@ -25,7 +25,7 @@ org.apache.spark spark-parent - 1.1.1-SNAPSHOT + 1.1.1 pom Spark Project Parent POM http://spark.apache.org/ @@ -40,7 +40,7 @@ scm:git:git@github.com:apache/spark.git scm:git:https://git-wip-us.apache.org/repos/asf/spark.git scm:git:git@github.com:apache/spark.git - HEAD + v1.1.1-rc2 diff --git a/repl/pom.xml b/repl/pom.xml index 8748ada36f57a..b0b9196bc7b86 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1-SNAPSHOT + 1.1.1 ../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index e2356381c07fb..19ca3044f987e 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1-SNAPSHOT + 1.1.1 ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 3efea9ab8b247..148a521d6e5e0 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1-SNAPSHOT + 1.1.1 ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index c264ff4ec92e5..c3c2f552f25e1 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1-SNAPSHOT + 1.1.1 ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 1e689e6d6dcf2..ff01bc6dc7f8c 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1-SNAPSHOT + 1.1.1 ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index c0ce0d7c7478d..9b56e32764a5f 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1-SNAPSHOT + 1.1.1 ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index c601fd5fbbee2..a2bb5af01dcd3 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent - 1.1.1-SNAPSHOT + 1.1.1 ../pom.xml diff --git a/yarn/pom.xml b/yarn/pom.xml index 18f27b827ff1a..2fad21bb08505 100644 --- a/yarn/pom.xml +++ b/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent - 1.1.1-SNAPSHOT + 1.1.1 ../pom.xml diff --git a/yarn/stable/pom.xml b/yarn/stable/pom.xml index 2ba3baf0e3b2e..74a30ae8a7af9 100644 --- a/yarn/stable/pom.xml +++ b/yarn/stable/pom.xml @@ -20,7 +20,7 @@ org.apache.spark yarn-parent_2.10 - 1.1.1-SNAPSHOT + 1.1.1 ../pom.xml From 1df1c1d9bc6238ff1db268e36c2139a196109b0f Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Wed, 19 Nov 2014 20:11:02 +0000 Subject: [PATCH 12/72] [maven-release-plugin] prepare for next development iteration --- assembly/pom.xml | 2 +- bagel/pom.xml | 2 +- core/pom.xml | 2 +- examples/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka/pom.xml | 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml | 2 +- 
external/zeromq/pom.xml | 2 +- extras/kinesis-asl/pom.xml | 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 4 ++-- repl/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- yarn/pom.xml | 2 +- yarn/stable/pom.xml | 2 +- 24 files changed, 25 insertions(+), 25 deletions(-) diff --git a/assembly/pom.xml b/assembly/pom.xml index e79d29b999f19..5e931b7457210 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.2-SNAPSHOT ../pom.xml diff --git a/bagel/pom.xml b/bagel/pom.xml index f1c2b2171e010..83faf29de577f 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.2-SNAPSHOT ../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 96effdb711165..664e52f761d32 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.2-SNAPSHOT ../pom.xml diff --git a/examples/pom.xml b/examples/pom.xml index aa748c6de0f12..4d9da0191face 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.2-SNAPSHOT ../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index c443eaa238407..692f87b1cb0ec 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.2-SNAPSHOT ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 6d06a2da640f3..5d0f7ffc5390a 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.2-SNAPSHOT ../../pom.xml diff --git a/external/kafka/pom.xml b/external/kafka/pom.xml index 09602f672516b..b267c4757623c 100644 --- a/external/kafka/pom.xml +++ b/external/kafka/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.2-SNAPSHOT ../../pom.xml diff --git a/external/mqtt/pom.xml b/external/mqtt/pom.xml index 462079cbf2eb8..c7fbf4b95500e 100644 --- a/external/mqtt/pom.xml +++ b/external/mqtt/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.2-SNAPSHOT ../../pom.xml diff --git a/external/twitter/pom.xml b/external/twitter/pom.xml index c1628831bf258..36afc14c64fc5 100644 --- a/external/twitter/pom.xml +++ b/external/twitter/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.2-SNAPSHOT ../../pom.xml diff --git a/external/zeromq/pom.xml b/external/zeromq/pom.xml index 7670646bc6ea5..5ba0fb1a4a7f3 100644 --- a/external/zeromq/pom.xml +++ b/external/zeromq/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.2-SNAPSHOT ../../pom.xml diff --git a/extras/kinesis-asl/pom.xml b/extras/kinesis-asl/pom.xml index 2433818b07be0..a872bf2327fc6 100644 --- a/extras/kinesis-asl/pom.xml +++ b/extras/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.2-SNAPSHOT ../../pom.xml diff --git a/extras/spark-ganglia-lgpl/pom.xml b/extras/spark-ganglia-lgpl/pom.xml index b6f4875f9d7ca..430a5e7cbfde0 100644 --- a/extras/spark-ganglia-lgpl/pom.xml +++ b/extras/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.2-SNAPSHOT ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index cb3d09038e4da..b3c843a5f3c4c 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 
1.1.1 + 1.1.2-SNAPSHOT ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index 9443f6118b3bb..f228212091b44 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.2-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index 0848bd11cde7c..d185ccccf3686 100644 --- a/pom.xml +++ b/pom.xml @@ -25,7 +25,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.2-SNAPSHOT pom Spark Project Parent POM http://spark.apache.org/ @@ -40,7 +40,7 @@ scm:git:git@github.com:apache/spark.git scm:git:https://git-wip-us.apache.org/repos/asf/spark.git scm:git:git@github.com:apache/spark.git - v1.1.1-rc2 + HEAD diff --git a/repl/pom.xml b/repl/pom.xml index b0b9196bc7b86..ef8d40d84285a 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.2-SNAPSHOT ../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 19ca3044f987e..909dc94230071 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.2-SNAPSHOT ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 148a521d6e5e0..f6a5f955cdd64 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.2-SNAPSHOT ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index c3c2f552f25e1..b7540f02ce8ea 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.2-SNAPSHOT ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index ff01bc6dc7f8c..241e305cad986 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.2-SNAPSHOT ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index 9b56e32764a5f..09c403f08b16a 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.2-SNAPSHOT ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index a2bb5af01dcd3..09894f4eb07ea 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.2-SNAPSHOT ../pom.xml diff --git a/yarn/pom.xml b/yarn/pom.xml index 2fad21bb08505..bf298998e559f 100644 --- a/yarn/pom.xml +++ b/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.2-SNAPSHOT ../pom.xml diff --git a/yarn/stable/pom.xml b/yarn/stable/pom.xml index 74a30ae8a7af9..c31eb0faf4368 100644 --- a/yarn/stable/pom.xml +++ b/yarn/stable/pom.xml @@ -20,7 +20,7 @@ org.apache.spark yarn-parent_2.10 - 1.1.1 + 1.1.2-SNAPSHOT ../pom.xml From 1b2b7dd8ddfdd30c2c2de8668e1086bc894ab2f0 Mon Sep 17 00:00:00 2001 From: Mark Hamstra Date: Mon, 24 Nov 2014 11:56:18 -0800 Subject: [PATCH 13/72] Fixed merge typo --- tools/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/pom.xml b/tools/pom.xml index 51569887cf3b8..b47aab79a17ac 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent - 1.1.1-csd-`-SNAPSHOT + 1.1.1-csd-1-SNAPSHOT ../pom.xml From 63717375d7e592a5a0ecf29662bfd285bf8e1115 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Mon, 24 Nov 2014 14:29:40 -0800 Subject: [PATCH 14/72] Update versions to 1.1.2-SNAPSHOT --- core/src/main/scala/org/apache/spark/SparkContext.scala | 2 +- docs/_config.yml | 4 ++-- extras/java8-tests/pom.xml | 2 +- python/epydoc.conf | 2 +- python/pyspark/shell.py | 2 +- 
.../src/main/scala/org/apache/spark/repl/SparkILoopInit.scala | 2 +- yarn/alpha/pom.xml | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index cea5cf2022b7b..e1e7e83570c4b 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -1334,7 +1334,7 @@ class SparkContext(config: SparkConf) extends Logging { */ object SparkContext extends Logging { - private[spark] val SPARK_VERSION = "1.1.1" + private[spark] val SPARK_VERSION = "1.1.2-SNAPSHOT" private[spark] val SPARK_JOB_DESCRIPTION = "spark.job.description" diff --git a/docs/_config.yml b/docs/_config.yml index d777f619479a6..33d6921e8e9da 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -3,8 +3,8 @@ markdown: kramdown # These allow the documentation to be updated with nerw releases # of Spark, Scala, and Mesos. -SPARK_VERSION: 1.1.1 -SPARK_VERSION_SHORT: 1.1.1 +SPARK_VERSION: 1.1.2 +SPARK_VERSION_SHORT: 1.1.2 SCALA_BINARY_VERSION: "2.10" SCALA_VERSION: "2.10.4" MESOS_VERSION: 0.18.1 diff --git a/extras/java8-tests/pom.xml b/extras/java8-tests/pom.xml index e1f0d7fadaf97..061dddd3d0fd9 100644 --- a/extras/java8-tests/pom.xml +++ b/extras/java8-tests/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent - 1.1.1 + 1.1.2-SNAPSHOT ../../pom.xml diff --git a/python/epydoc.conf b/python/epydoc.conf index d066ecb7712c5..284233898cbb9 100644 --- a/python/epydoc.conf +++ b/python/epydoc.conf @@ -18,7 +18,7 @@ # # Information about the project. -name: Spark 1.1.0 Python API Docs +name: Spark 1.1.2 Python API Docs url: http://spark.apache.org # The list of modules to document. Modules can be named using diff --git a/python/pyspark/shell.py b/python/pyspark/shell.py index 0ee431cf6749c..99054a51f635f 100644 --- a/python/pyspark/shell.py +++ b/python/pyspark/shell.py @@ -47,7 +47,7 @@ ____ __ / __/__ ___ _____/ /__ _\ \/ _ \/ _ `/ __/ '_/ - /__ / .__/\_,_/_/ /_/\_\ version 1.1.1 + /__ / .__/\_,_/_/ /_/\_\ version 1.1.2-SNAPSHOT /_/ """) print("Using Python version %s (%s, %s)" % ( diff --git a/repl/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala b/repl/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala index d834261fbb91d..b6dfcd77879c6 100644 --- a/repl/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala +++ b/repl/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala @@ -26,7 +26,7 @@ trait SparkILoopInit { ____ __ / __/__ ___ _____/ /__ _\ \/ _ \/ _ `/ __/ '_/ - /___/ .__/\_,_/_/ /_/\_\ version 1.1.1 + /___/ .__/\_,_/_/ /_/\_\ version 1.1.2-SNAPSHOT /_/ """) import Properties._ diff --git a/yarn/alpha/pom.xml b/yarn/alpha/pom.xml index 5337490eecdd9..6a6f8a0490ac7 100644 --- a/yarn/alpha/pom.xml +++ b/yarn/alpha/pom.xml @@ -20,7 +20,7 @@ org.apache.spark yarn-parent_2.10 - 1.1.1 + 1.1.2-SNAPSHOT ../pom.xml From 7aa592c741cb50822e7d070bd240ac4fe7d7933f Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Tue, 25 Nov 2014 14:16:27 -0800 Subject: [PATCH 15/72] [SPARK-4196][SPARK-4602][Streaming] Fix serialization issue in PairDStreamFunctions.saveAsNewAPIHadoopFiles Solves two JIRAs in one shot - Makes the ForechDStream created by saveAsNewAPIHadoopFiles serializable for checkpoints - Makes the default configuration object used saveAsNewAPIHadoopFiles be the Spark's hadoop configuration Author: Tathagata Das Closes #3457 from tdas/savefiles-fix and squashes the following commits: bb4729a [Tathagata Das] Same 
treatment for saveAsHadoopFiles b382ea9 [Tathagata Das] Fix serialization issue in PairDStreamFunctions.saveAsNewAPIHadoopFiles. (cherry picked from commit 8838ad7c135a585cde015dc38b5cb23314502dd9) Signed-off-by: Tathagata Das --- .../dstream/PairDStreamFunctions.scala | 30 +++++----- .../spark/streaming/CheckpointSuite.scala | 56 ++++++++++++++++++- 2 files changed, 70 insertions(+), 16 deletions(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala index 826bf39e860e1..d80f964e1b719 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala @@ -17,20 +17,17 @@ package org.apache.spark.streaming.dstream -import org.apache.spark.streaming.StreamingContext._ - -import org.apache.spark.{Partitioner, HashPartitioner} -import org.apache.spark.SparkContext._ -import org.apache.spark.rdd.RDD - import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag -import org.apache.hadoop.mapred.JobConf -import org.apache.hadoop.mapreduce.{OutputFormat => NewOutputFormat} -import org.apache.hadoop.mapred.OutputFormat import org.apache.hadoop.conf.Configuration -import org.apache.spark.streaming.{Time, Duration} +import org.apache.hadoop.mapred.{JobConf, OutputFormat} +import org.apache.hadoop.mapreduce.{OutputFormat => NewOutputFormat} + +import org.apache.spark.{HashPartitioner, Partitioner, SerializableWritable} +import org.apache.spark.rdd.RDD +import org.apache.spark.streaming.{Duration, Time} +import org.apache.spark.streaming.StreamingContext._ /** * Extra functions available on DStream of (key, value) pairs through an implicit conversion. 
@@ -590,11 +587,13 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)]) keyClass: Class[_], valueClass: Class[_], outputFormatClass: Class[_ <: OutputFormat[_, _]], - conf: JobConf = new JobConf + conf: JobConf = new JobConf(ssc.sparkContext.hadoopConfiguration) ) { + // Wrap conf in SerializableWritable so that ForeachDStream can be serialized for checkpoints + val serializableConf = new SerializableWritable(conf) val saveFunc = (rdd: RDD[(K, V)], time: Time) => { val file = rddToFileName(prefix, suffix, time) - rdd.saveAsHadoopFile(file, keyClass, valueClass, outputFormatClass, conf) + rdd.saveAsHadoopFile(file, keyClass, valueClass, outputFormatClass, serializableConf.value) } self.foreachRDD(saveFunc) } @@ -621,11 +620,14 @@ class PairDStreamFunctions[K, V](self: DStream[(K,V)]) keyClass: Class[_], valueClass: Class[_], outputFormatClass: Class[_ <: NewOutputFormat[_, _]], - conf: Configuration = new Configuration + conf: Configuration = ssc.sparkContext.hadoopConfiguration ) { + // Wrap conf in SerializableWritable so that ForeachDStream can be serialized for checkpoints + val serializableConf = new SerializableWritable(conf) val saveFunc = (rdd: RDD[(K, V)], time: Time) => { val file = rddToFileName(prefix, suffix, time) - rdd.saveAsNewAPIHadoopFile(file, keyClass, valueClass, outputFormatClass, conf) + rdd.saveAsNewAPIHadoopFile( + file, keyClass, valueClass, outputFormatClass, serializableConf.value) } self.foreachRDD(saveFunc) } diff --git a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala index 10ad3c9e1adc9..3c6569635083b 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala @@ -22,9 +22,14 @@ import java.nio.charset.Charset import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag + import com.google.common.io.Files -import org.apache.hadoop.fs.{Path, FileSystem} import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.hadoop.io.{IntWritable, Text} +import org.apache.hadoop.mapred.TextOutputFormat +import org.apache.hadoop.mapreduce.lib.output.{TextOutputFormat => NewTextOutputFormat} + import org.apache.spark.streaming.StreamingContext._ import org.apache.spark.streaming.dstream.{DStream, FileInputDStream} import org.apache.spark.streaming.util.ManualClock @@ -205,6 +210,51 @@ class CheckpointSuite extends TestSuiteBase { testCheckpointedOperation(input, operation, output, 7) } + test("recovery with saveAsHadoopFiles operation") { + val tempDir = Files.createTempDir() + try { + testCheckpointedOperation( + Seq(Seq("a", "a", "b"), Seq("", ""), Seq(), Seq("a", "a", "b"), Seq("", ""), Seq()), + (s: DStream[String]) => { + val output = s.map(x => (x, 1)).reduceByKey(_ + _) + output.saveAsHadoopFiles( + tempDir.toURI.toString, + "result", + classOf[Text], + classOf[IntWritable], + classOf[TextOutputFormat[Text, IntWritable]]) + output + }, + Seq(Seq(("a", 2), ("b", 1)), Seq(("", 2)), Seq(), Seq(("a", 2), ("b", 1)), Seq(("", 2)), Seq()), + 3 + ) + } finally { + Utils.deleteRecursively(tempDir) + } + } + + test("recovery with saveAsNewAPIHadoopFiles operation") { + val tempDir = Files.createTempDir() + try { + testCheckpointedOperation( + Seq(Seq("a", "a", "b"), Seq("", ""), Seq(), Seq("a", "a", "b"), Seq("", ""), Seq()), + (s: DStream[String]) => { + val output = s.map(x => (x, 1)).reduceByKey(_ + 
_) + output.saveAsNewAPIHadoopFiles( + tempDir.toURI.toString, + "result", + classOf[Text], + classOf[IntWritable], + classOf[NewTextOutputFormat[Text, IntWritable]]) + output + }, + Seq(Seq(("a", 2), ("b", 1)), Seq(("", 2)), Seq(), Seq(("a", 2), ("b", 1)), Seq(("", 2)), Seq()), + 3 + ) + } finally { + Utils.deleteRecursively(tempDir) + } + } // This tests whether the StateDStream's RDD checkpoints works correctly such // that the system can recover from a master failure. This assumes as reliable, @@ -392,7 +442,9 @@ class CheckpointSuite extends TestSuiteBase { logInfo("Manual clock after advancing = " + clock.time) Thread.sleep(batchDuration.milliseconds) - val outputStream = ssc.graph.getOutputStreams.head.asInstanceOf[TestOutputStreamWithPartitions[V]] + val outputStream = ssc.graph.getOutputStreams.filter { dstream => + dstream.isInstanceOf[TestOutputStreamWithPartitions[V]] + }.head.asInstanceOf[TestOutputStreamWithPartitions[V]] outputStream.output.map(_.flatten) } } From 1a7f4144efac1cd375b38ee3f53ea0a5d7692d1c Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Tue, 25 Nov 2014 15:27:20 -0800 Subject: [PATCH 16/72] [HOTFIX] Fixing broken build due to missing imports. --- .../apache/spark/streaming/dstream/PairDStreamFunctions.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala index d80f964e1b719..32b6977900442 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala @@ -26,6 +26,7 @@ import org.apache.hadoop.mapreduce.{OutputFormat => NewOutputFormat} import org.apache.spark.{HashPartitioner, Partitioner, SerializableWritable} import org.apache.spark.rdd.RDD +import org.apache.spark.SparkContext._ import org.apache.spark.streaming.{Duration, Time} import org.apache.spark.streaming.StreamingContext._ From a59c4457ed1613dd13f1e2a7bacba6a6f00c86bf Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Wed, 26 Nov 2014 23:16:23 -0800 Subject: [PATCH 17/72] [Release] Automate generation of contributors list This commit provides a script that computes the contributors list by linking the github commits with JIRA issues. Automatically translating github usernames remains a TODO at this point. --- dev/create-release/generate-contributors.py | 206 ++++++++++++++++++++ dev/create-release/releaseutils.py | 124 ++++++++++++ 2 files changed, 330 insertions(+) create mode 100755 dev/create-release/generate-contributors.py create mode 100755 dev/create-release/releaseutils.py diff --git a/dev/create-release/generate-contributors.py b/dev/create-release/generate-contributors.py new file mode 100755 index 0000000000000..f4bf734081583 --- /dev/null +++ b/dev/create-release/generate-contributors.py @@ -0,0 +1,206 @@ +#!/usr/bin/env python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This script automates the process of creating release notes. + +import os +import re +import sys + +from releaseutils import * + +# You must set the following before use! +JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira") +START_COMMIT = os.environ.get("START_COMMIT", "37b100") +END_COMMIT = os.environ.get("END_COMMIT", "3693ae") + +try: + from jira.client import JIRA +except ImportError: + print "This tool requires the jira-python library" + print "Install using 'sudo pip install jira-python'" + sys.exit(-1) + +try: + import unidecode +except ImportError: + print "This tool requires the unidecode library to decode obscure github usernames" + print "Install using 'sudo pip install unidecode'" + sys.exit(-1) + +# If commit range is not specified, prompt the user to provide it +if not START_COMMIT or not END_COMMIT: + print "A commit range is required to proceed." + if not START_COMMIT: + START_COMMIT = raw_input("Please specify starting commit hash (inclusive): ") + if not END_COMMIT: + END_COMMIT = raw_input("Please specify ending commit hash (non-inclusive): ") + +# Verify provided arguments +start_commit_line = get_one_line(START_COMMIT) +end_commit_line = get_one_line(END_COMMIT) +num_commits = num_commits_in_range(START_COMMIT, END_COMMIT) +if not start_commit_line: sys.exit("Start commit %s not found!" % START_COMMIT) +if not end_commit_line: sys.exit("End commit %s not found!" % END_COMMIT) +if num_commits == 0: + sys.exit("There are no commits in the provided range [%s, %s)" % (START_COMMIT, END_COMMIT)) +print "\n==================================================================================" +print "JIRA server: %s" % JIRA_API_BASE +print "Start commit (inclusive): %s" % start_commit_line +print "End commit (non-inclusive): %s" % end_commit_line +print "Number of commits in this range: %s" % num_commits +print +response = raw_input("Is this correct? 
[Y/n] ") +if response.lower() != "y" and response: + sys.exit("Ok, exiting") +print "==================================================================================\n" + +# Find all commits within this range +print "Gathering commits within range [%s..%s)" % (START_COMMIT, END_COMMIT) +commits = get_one_line_commits(START_COMMIT, END_COMMIT) +if not commits: sys.exit("Error: No commits found within this range!") +commits = commits.split("\n") + +# Filter out special commits +releases = [] +reverts = [] +nojiras = [] +filtered_commits = [] +def is_release(commit): + return re.findall("\[release\]", commit.lower()) or\ + "maven-release-plugin" in commit or "CHANGES.txt" in commit +def has_no_jira(commit): + return not re.findall("SPARK-[0-9]+", commit.upper()) +def is_revert(commit): + return "revert" in commit.lower() +def is_docs(commit): + return re.findall("docs*", commit.lower()) or "programming guide" in commit.lower() +for c in commits: + if not c: continue + elif is_release(c): releases.append(c) + elif is_revert(c): reverts.append(c) + elif is_docs(c): filtered_commits.append(c) # docs may not have JIRA numbers + elif has_no_jira(c): nojiras.append(c) + else: filtered_commits.append(c) + +# Warn against ignored commits +def print_indented(_list): + for x in _list: print " %s" % x +if releases or reverts or nojiras: + print "\n==================================================================================" + if releases: print "Releases (%d)" % len(releases); print_indented(releases) + if reverts: print "Reverts (%d)" % len(reverts); print_indented(reverts) + if nojiras: print "No JIRA (%d)" % len(nojiras); print_indented(nojiras) + print "==================== Warning: the above commits will be ignored ==================\n" +response = raw_input("%d commits left to process. Ok to proceed? 
[y/N] " % len(filtered_commits)) +if response.lower() != "y": + sys.exit("Ok, exiting.") + +# Keep track of warnings to tell the user at the end +warnings = [] + +# Populate a map that groups issues and components by author +# It takes the form: Author name -> { Contribution type -> Spark components } +# For instance, +# { +# 'Andrew Or': { +# 'bug fixes': ['windows', 'core', 'web ui'], +# 'improvements': ['core'] +# }, +# 'Tathagata Das' : { +# 'bug fixes': ['streaming'] +# 'new feature': ['streaming'] +# } +# } +# +author_info = {} +jira_options = { "server": JIRA_API_BASE } +jira = JIRA(jira_options) +print "\n=========================== Compiling contributor list ===========================" +for commit in filtered_commits: + commit_hash = re.findall("^[a-z0-9]+", commit)[0] + issues = re.findall("SPARK-[0-9]+", commit.upper()) + author = get_author(commit_hash) + author = unidecode.unidecode(unicode(author, "UTF-8")) # guard against special characters + date = get_date(commit_hash) + # Parse components from the commit message, if any + commit_components = find_components(commit, commit_hash) + # Populate or merge an issue into author_info[author] + def populate(issue_type, components): + components = components or [CORE_COMPONENT] # assume core if no components provided + if author not in author_info: + author_info[author] = {} + if issue_type not in author_info[author]: + author_info[author][issue_type] = set() + for component in all_components: + author_info[author][issue_type].add(component) + # Find issues and components associated with this commit + for issue in issues: + jira_issue = jira.issue(issue) + jira_type = jira_issue.fields.issuetype.name + jira_type = translate_issue_type(jira_type, issue, warnings) + jira_components = [translate_component(c.name, commit_hash, warnings)\ + for c in jira_issue.fields.components] + all_components = set(jira_components + commit_components) + populate(jira_type, all_components) + # For docs without an associated JIRA, manually add it ourselves + if is_docs(commit) and not issues: + populate("documentation", commit_components) + print " Processed commit %s authored by %s on %s" % (commit_hash, author, date) +print "==================================================================================\n" + +# Write to contributors file ordered by author names +# Each line takes the format "Author name - semi-colon delimited contributions" +# e.g. Andrew Or - Bug fixes in Windows, Core, and Web UI; improvements in Core +# e.g. Tathagata Das - Bug fixes and new features in Streaming +contributors_file_name = "contributors.txt" +contributors_file = open(contributors_file_name, "w") +authors = author_info.keys() +authors.sort() +for author in authors: + contribution = "" + components = set() + issue_types = set() + for issue_type, comps in author_info[author].items(): + components.update(comps) + issue_types.add(issue_type) + # If there is only one component, mention it only once + # e.g. Bug fixes, improvements in MLlib + if len(components) == 1: + contribution = "%s in %s" % (nice_join(issue_types), next(iter(components))) + # Otherwise, group contributions by issue types instead of modules + # e.g. 
Bug fixes in MLlib, Core, and Streaming; documentation in YARN + else: + contributions = ["%s in %s" % (issue_type, nice_join(comps)) \ + for issue_type, comps in author_info[author].items()] + contribution = "; ".join(contributions) + # Do not use python's capitalize() on the whole string to preserve case + assert contribution + contribution = contribution[0].capitalize() + contribution[1:] + line = "%s - %s" % (author, contribution) + contributors_file.write(line + "\n") +contributors_file.close() +print "Contributors list is successfully written to %s!" % contributors_file_name + +# Log any warnings encountered in the process +if warnings: + print "\n============ Warnings encountered while creating the contributor list ============" + for w in warnings: print w + print "Please correct these in the final contributors list at %s." % contributors_file_name + print "==================================================================================\n" + diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py new file mode 100755 index 0000000000000..e56d7fa58fa2c --- /dev/null +++ b/dev/create-release/releaseutils.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This file contains helper methods used in creating a release. 
+ +import re +from subprocess import Popen, PIPE + +# Utility functions run git commands (written with Git 1.8.5) +def run_cmd(cmd): return Popen(cmd, stdout=PIPE).communicate()[0] +def get_author(commit_hash): + return run_cmd(["git", "show", "--quiet", "--pretty=format:%an", commit_hash]) +def get_date(commit_hash): + return run_cmd(["git", "show", "--quiet", "--pretty=format:%cd", commit_hash]) +def get_one_line(commit_hash): + return run_cmd(["git", "show", "--quiet", "--pretty=format:\"%h %cd %s\"", commit_hash]) +def get_one_line_commits(start_hash, end_hash): + return run_cmd(["git", "log", "--oneline", "%s..%s" % (start_hash, end_hash)]) +def num_commits_in_range(start_hash, end_hash): + output = run_cmd(["git", "log", "--oneline", "%s..%s" % (start_hash, end_hash)]) + lines = [line for line in output.split("\n") if line] # filter out empty lines + return len(lines) + +# Maintain a mapping for translating issue types to contributions in the release notes +# This serves an additional function of warning the user against unknown issue types +# Note: This list is partially derived from this link: +# https://issues.apache.org/jira/plugins/servlet/project-config/SPARK/issuetypes +# Keep these in lower case +known_issue_types = { + "bug": "bug fixes", + "build": "build fixes", + "improvement": "improvements", + "new feature": "new features", + "documentation": "documentation" +} + +# Maintain a mapping for translating component names when creating the release notes +# This serves an additional function of warning the user against unknown components +# Note: This list is largely derived from this link: +# https://issues.apache.org/jira/plugins/servlet/project-config/SPARK/components +CORE_COMPONENT = "Core" +known_components = { + "block manager": CORE_COMPONENT, + "build": CORE_COMPONENT, + "deploy": CORE_COMPONENT, + "documentation": CORE_COMPONENT, + "ec2": "EC2", + "examples": CORE_COMPONENT, + "graphx": "GraphX", + "input/output": CORE_COMPONENT, + "java api": "Java API", + "mesos": "Mesos", + "ml": "MLlib", + "mllib": "MLlib", + "project infra": "Project Infra", + "pyspark": "PySpark", + "shuffle": "Shuffle", + "spark core": CORE_COMPONENT, + "spark shell": CORE_COMPONENT, + "sql": "SQL", + "streaming": "Streaming", + "web ui": "Web UI", + "windows": "Windows", + "yarn": "YARN" +} + +# Translate issue types using a format appropriate for writing contributions +# If an unknown issue type is encountered, warn the user +def translate_issue_type(issue_type, issue_id, warnings): + issue_type = issue_type.lower() + if issue_type in known_issue_types: + return known_issue_types[issue_type] + else: + warnings.append("Unknown issue type \"%s\" (see %s)" % (issue_type, issue_id)) + return issue_type + +# Translate component names using a format appropriate for writing contributions +# If an unknown component is encountered, warn the user +def translate_component(component, commit_hash, warnings): + component = component.lower() + if component in known_components: + return known_components[component] + else: + warnings.append("Unknown component \"%s\" (see %s)" % (component, commit_hash)) + return component + +# Parse components in the commit message +# The returned components are already filtered and translated +def find_components(commit, commit_hash): + components = re.findall("\[\w*\]", commit.lower()) + components = [translate_component(c, commit_hash)\ + for c in components if c in known_components] + return components + +# Join a list of strings in a human-readable manner +# e.g. 
["Juice"] -> "Juice" +# e.g. ["Juice", "baby"] -> "Juice and baby" +# e.g. ["Juice", "baby", "moon"] -> "Juice, baby, and moon" +def nice_join(str_list): + str_list = list(str_list) # sometimes it's a set + if not str_list: + return "" + elif len(str_list) == 1: + return next(iter(str_list)) + elif len(str_list) == 2: + return " and ".join(str_list) + else: + return ", ".join(str_list[:-1]) + ", and " + str_list[-1] + From f8a4fd393d2eb1a58429653050ef036ca9f5aa2d Mon Sep 17 00:00:00 2001 From: roxchkplusony Date: Fri, 28 Nov 2014 00:34:41 -0800 Subject: [PATCH 18/72] [BRANCH-1.1][SPARK-4626] Kill a task only if the executorId is (still) registered with the scheduler v1.1 backport for #3483 Author: roxchkplusony Closes #3503 from roxchkplusony/bugfix/4626-1.1 and squashes the following commits: 234d350 [roxchkplusony] [SPARK-4626] Kill a task only if the executorId is (still) registered with the scheduler --- .../scheduler/cluster/CoarseGrainedSchedulerBackend.scala | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index e8a3a3bfa28e3..ae94b38d575d6 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -118,7 +118,12 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, actorSystem: A makeOffers() case KillTask(taskId, executorId, interruptThread) => - executorActor(executorId) ! KillTask(taskId, executorId, interruptThread) + if (executorActor.contains(executorId)) { + executorActor(executorId) ! KillTask(taskId, executorId, interruptThread) + } else { + // Ignoring the task kill since the executor is not registered. + logWarning(s"Attempted to kill task $taskId for unknown executor $executorId.") + } case StopDriver => sender ! true From 24b5c03689baa8d1cbe014853bc1d0de1901dcdf Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 28 Nov 2014 18:04:05 -0800 Subject: [PATCH 19/72] [SPARK-4597] Use proper exception and reset variable in Utils.createTempDir() `File.exists()` and `File.mkdirs()` only throw `SecurityException` instead of `IOException`. Then, when an exception is thrown, `dir` should be reset too. Author: Liang-Chi Hsieh Closes #3449 from viirya/fix_createtempdir and squashes the following commits: 36cacbd [Liang-Chi Hsieh] Use proper exception and reset variable. 
(cherry picked from commit 49fe8797e64f10c574e0790b32a8c3fdc7e594a0) Signed-off-by: Josh Rosen --- core/src/main/scala/org/apache/spark/util/Utils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index a9371c8208d13..ef11ed3e48305 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -277,7 +277,7 @@ private[spark] object Utils extends Logging { if (dir.exists() || !dir.mkdirs()) { dir = null } - } catch { case e: IOException => ; } + } catch { case e: SecurityException => dir = null; } } registerShutdownDeleteDir(dir) From 1a2508b73f6d46a0faf7740b85a5c216c925c25a Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Sun, 30 Nov 2014 11:40:08 -0800 Subject: [PATCH 20/72] SPARK-2143 [WEB UI] Add Spark version to UI footer This PR adds the Spark version number to the UI footer; this is how it looks: ![screen shot 2014-11-21 at 22 58 40](https://cloud.githubusercontent.com/assets/822522/5157738/f4822094-7316-11e4-98f1-333a535fdcfa.png) Author: Sean Owen Closes #3410 from srowen/SPARK-2143 and squashes the following commits: e9b3a7a [Sean Owen] Add Spark version to footer --- core/src/main/scala/org/apache/spark/ui/UIUtils.scala | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala index a89ae241fd73d..0402d0a4f3f9c 100644 --- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala @@ -204,6 +204,11 @@ private[spark] object UIUtils extends Logging { {content} + } @@ -230,6 +235,11 @@ private[spark] object UIUtils extends Logging { {content} + } From 90d90b28c5236c83233fdfce546b72145f0135c8 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Sun, 30 Nov 2014 11:48:46 -0800 Subject: [PATCH 21/72] [HOTFIX] Fix build break in 1a2508b73f6d46a0faf7740b85a5c216c925c25a org.apache.spark.SPARK_VERSION is new in 1.2; in earlier versions, we have to use SparkContext.SPARK_VERSION. --- core/src/main/scala/org/apache/spark/ui/UIUtils.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala index 0402d0a4f3f9c..8c437ce8f67f5 100644 --- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala @@ -23,6 +23,7 @@ import java.util.{Locale, Date} import scala.xml.Node import org.apache.spark.Logging +import org.apache.spark.SparkContext /** Utility functions for generating XML pages with spark content. 
*/ private[spark] object UIUtils extends Logging { @@ -206,7 +207,7 @@ private[spark] object UIUtils extends Logging { @@ -237,7 +238,7 @@ private[spark] object UIUtils extends Logging { From 91eadd2d8435f9cc63473da4f1121655480aaaa8 Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Sun, 30 Nov 2014 19:04:07 -0800 Subject: [PATCH 22/72] [DOC] Fixes formatting typo in SQL programming guide [Review on Reviewable](https://reviewable.io/reviews/apache/spark/3498) Author: Cheng Lian Closes #3498 from liancheng/fix-sql-doc-typo and squashes the following commits: 865ecd7 [Cheng Lian] Fixes formatting typo in SQL programming guide (cherry picked from commit 2a4d389f70b2066b1ac32b081bef44e61fefb03c) Signed-off-by: Josh Rosen --- docs/sql-programming-guide.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 90ac971751f6c..578ae09845ea6 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -892,7 +892,6 @@ export HIVE_SERVER2_THRIFT_BIND_HOST= ./sbin/start-thriftserver.sh \ --master \ ... -``` {% endhighlight %} or system properties: @@ -903,7 +902,6 @@ or system properties: --hiveconf hive.server2.thrift.bind.host= \ --master ... -``` {% endhighlight %} Now you can use beeline to test the Thrift JDBC server: From f333e4f278851e4c28197fefdee68a0e88ca0126 Mon Sep 17 00:00:00 2001 From: Kay Ousterhout Date: Tue, 2 Dec 2014 09:06:02 -0800 Subject: [PATCH 23/72] [SPARK-4686] Link to allowed master URLs is broken The link points to the old scala programming guide; it should point to the submitting applications page. This should be backported to 1.1.2 (it's been broken as of 1.0). Author: Kay Ousterhout Closes #3542 from kayousterhout/SPARK-4686 and squashes the following commits: a8fc43b [Kay Ousterhout] [SPARK-4686] Link to allowed master URLs is broken (cherry picked from commit d9a148ba6a67a01e4bf77c35c41dd4cbc8918c82) Signed-off-by: Kay Ousterhout --- docs/configuration.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/configuration.md b/docs/configuration.md index 335650d11eb74..13fc251c1733f 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -92,7 +92,7 @@ of the most common options to set are: (none) The cluster manager to connect to. See the list of - allowed master URL's. + allowed master URL's. From aec20af1ffb4cf209e8c1154248e5c5acbaa3a0e Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Tue, 2 Dec 2014 16:36:12 -0800 Subject: [PATCH 24/72] [Release] Translate unknown author names automatically --- dev/create-release/generate-contributors.py | 36 ++++---- dev/create-release/releaseutils.py | 93 +++++++++++++++++++++ 2 files changed, 111 insertions(+), 18 deletions(-) diff --git a/dev/create-release/generate-contributors.py b/dev/create-release/generate-contributors.py index f4bf734081583..99c29ef9ff8b6 100755 --- a/dev/create-release/generate-contributors.py +++ b/dev/create-release/generate-contributors.py @@ -26,23 +26,11 @@ # You must set the following before use! 
JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira") +JIRA_USERNAME = os.environ.get("JIRA_USERNAME", None) +JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", None) START_COMMIT = os.environ.get("START_COMMIT", "37b100") END_COMMIT = os.environ.get("END_COMMIT", "3693ae") -try: - from jira.client import JIRA -except ImportError: - print "This tool requires the jira-python library" - print "Install using 'sudo pip install jira-python'" - sys.exit(-1) - -try: - import unidecode -except ImportError: - print "This tool requires the unidecode library to decode obscure github usernames" - print "Install using 'sudo pip install unidecode'" - sys.exit(-1) - # If commit range is not specified, prompt the user to provide it if not START_COMMIT or not END_COMMIT: print "A commit range is required to proceed." @@ -52,6 +40,8 @@ END_COMMIT = raw_input("Please specify ending commit hash (non-inclusive): ") # Verify provided arguments +if not JIRA_USERNAME: sys.exit("JIRA_USERNAME must be provided") +if not JIRA_PASSWORD: sys.exit("JIRA_PASSWORD must be provided") start_commit_line = get_one_line(START_COMMIT) end_commit_line = get_one_line(END_COMMIT) num_commits = num_commits_in_range(START_COMMIT, END_COMMIT) @@ -70,6 +60,14 @@ sys.exit("Ok, exiting") print "==================================================================================\n" +# Setup JIRA and github clients. We use two JIRA clients, one with authentication +# and one without, because authentication is slow and required only when we query +# JIRA user details but not Spark issues +jira_options = { "server": JIRA_API_BASE } +jira_client = JIRA(options = jira_options) +jira_client_auth = JIRA(options = jira_options, basic_auth = (JIRA_USERNAME, JIRA_PASSWORD)) +github_client = Github() + # Find all commits within this range print "Gathering commits within range [%s..%s)" % (START_COMMIT, END_COMMIT) commits = get_one_line_commits(START_COMMIT, END_COMMIT) @@ -129,14 +127,16 @@ def print_indented(_list): # } # author_info = {} -jira_options = { "server": JIRA_API_BASE } -jira = JIRA(jira_options) print "\n=========================== Compiling contributor list ===========================" for commit in filtered_commits: commit_hash = re.findall("^[a-z0-9]+", commit)[0] issues = re.findall("SPARK-[0-9]+", commit.upper()) + # Translate the author in case the github username is not an actual name + # Also guard against any special characters used in the name + # Note the JIRA client we use here must have authentication enabled author = get_author(commit_hash) - author = unidecode.unidecode(unicode(author, "UTF-8")) # guard against special characters + author = unidecode.unidecode(unicode(author, "UTF-8")) + author = translate_author(author, github_client, jira_client_auth, warnings) date = get_date(commit_hash) # Parse components from the commit message, if any commit_components = find_components(commit, commit_hash) @@ -151,7 +151,7 @@ def populate(issue_type, components): author_info[author][issue_type].add(component) # Find issues and components associated with this commit for issue in issues: - jira_issue = jira.issue(issue) + jira_issue = jira_client.issue(issue) jira_type = jira_issue.fields.issuetype.name jira_type = translate_issue_type(jira_type, issue, warnings) jira_components = [translate_component(c.name, commit_hash, warnings)\ diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py index e56d7fa58fa2c..0d6830b11dc73 100755 --- a/dev/create-release/releaseutils.py +++ 
b/dev/create-release/releaseutils.py @@ -21,6 +21,29 @@ import re from subprocess import Popen, PIPE +try: + from jira.client import JIRA + from jira.exceptions import JIRAError +except ImportError: + print "This tool requires the jira-python library" + print "Install using 'sudo pip install jira-python'" + sys.exit(-1) + +try: + from github import Github + from github import GithubException +except ImportError: + print "This tool requires the PyGithub library" + print "Install using 'sudo pip install PyGithub'" + sys.exit(-1) + +try: + import unidecode +except ImportError: + print "This tool requires the unidecode library to decode obscure github usernames" + print "Install using 'sudo pip install unidecode'" + sys.exit(-1) + # Utility functions run git commands (written with Git 1.8.5) def run_cmd(cmd): return Popen(cmd, stdout=PIPE).communicate()[0] def get_author(commit_hash): @@ -122,3 +145,73 @@ def nice_join(str_list): else: return ", ".join(str_list[:-1]) + ", and " + str_list[-1] +# Return the full name of the specified user on Github +# If the user doesn't exist, return None +def get_github_name(author, github_client): + if github_client: + try: + return github_client.get_user(author).name + except GithubException as e: + # If this is not a "not found" exception + if e.status != 404: + raise e + return None + +# Return the full name of the specified user on JIRA +# If the user doesn't exist, return None +def get_jira_name(author, jira_client): + if jira_client: + try: + return jira_client.user(author).displayName + except JIRAError as e: + # If this is not a "not found" exception + if e.status_code != 404: + raise e + return None + +# Return whether the given name is in the form +def is_valid_author(author): + if not author: return False + author_words = len(author.split(" ")) + return author_words == 2 or author_words == 3 + +# Capitalize the first letter of each word in the given author name +def capitalize_author(author): + if not author: return None + words = author.split(" ") + words = [w[0].capitalize() + w[1:] for w in words if w] + return " ".join(words) + +# Maintain a mapping of translated author names as a cache +translated_authors = {} + +# Format the given author in a format appropriate for the contributors list. +# If the author is not an actual name, search github and JIRA for potential +# replacements and log all candidates as a warning. 
+def translate_author(github_author, github_client, jira_client, warnings): + if is_valid_author(github_author): + return capitalize_author(github_author) + # If the translated author is already cached, just return it + if github_author in translated_authors: + return translated_authors[github_author] + # Otherwise, author name is not found, so we need to search for an alternative name + candidates = set() + github_name = get_github_name(github_author, github_client) + jira_name = get_jira_name(github_author, jira_client) + if is_valid_author(github_name): github_name = capitalize_author(github_name) + if is_valid_author(jira_name): jira_name = capitalize_author(jira_name) + if github_name: candidates.add(github_name) + if jira_name: candidates.add(jira_name) + # Only use the github name as a replacement automatically + # The JIRA name may not make sense because it can belong to someone else + if is_valid_author(github_name): + candidates_message = " (another candidate is %s)" % jira_name if jira_name else "" + warnings.append("Replacing github user %s with %s%s" % (github_author, github_name, candidates_message)) + translated_authors[github_name] = github_name + return translated_authors[github_name] + # No direct replacement, so return the original author and list any candidates found + candidates_message = " (candidates: %s)" % nice_join(candidates) if candidates else "" + warnings.append("Unable to find a replacement for github user %s%s" % (github_author, candidates_message)) + translated_authors[github_author] = github_author + return translated_authors[github_author] + From e484b8af2ef59013ee977f9ae3143d258f21786b Mon Sep 17 00:00:00 2001 From: Masayoshi TSUZUKI Date: Wed, 3 Dec 2014 12:08:00 -0800 Subject: [PATCH 25/72] [SPARK-4701] Typo in sbt/sbt Modified typo. Author: Masayoshi TSUZUKI Closes #3560 from tsudukim/feature/SPARK-4701 and squashes the following commits: ed2a3f1 [Masayoshi TSUZUKI] Another whitespace position error. 1af3a35 [Masayoshi TSUZUKI] [SPARK-4701] Typo in sbt/sbt (cherry picked from commit 96786e3ee53a13a57463b74bec0e77b172f719a3) Signed-off-by: Andrew Or --- sbt/sbt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sbt/sbt b/sbt/sbt index 1b1aa1483a829..7790aa4fe8cf6 100755 --- a/sbt/sbt +++ b/sbt/sbt @@ -71,8 +71,8 @@ Usage: $script_name [options] -Dkey=val pass -Dkey=val directly to the java runtime -J-X pass option -X directly to the java runtime (-J is stripped) - -S-X add -X to sbt's scalacOptions (-J is stripped) - -PmavenProfiles Enable a maven profile for the build. + -S-X add -X to sbt's scalacOptions (-S is stripped) + -PmavenProfiles Enable a maven profile for the build. In the case of duplicated or conflicting options, the order above shows precedence: JAVA_OPTS lowest, command line options highest. From af7695456e04dc92dc47d55ab169c623e3f29d23 Mon Sep 17 00:00:00 2001 From: zsxwing Date: Wed, 3 Dec 2014 12:19:40 -0800 Subject: [PATCH 26/72] [SPARK-4715][Core] Make sure tryToAcquire won't return a negative value ShuffleMemoryManager.tryToAcquire may return a negative value. The unit test demonstrates this bug. It will output `0 did not equal -200 granted is negative`. 
Author: zsxwing Closes #3575 from zsxwing/SPARK-4715 and squashes the following commits: a193ae6 [zsxwing] Make sure tryToAcquire won't return a negative value --- .../spark/shuffle/ShuffleMemoryManager.scala | 5 +++-- .../shuffle/ShuffleMemoryManagerSuite.scala | 17 ++++++++++++++++- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala b/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala index c746e138b63c9..d82bb248fdcdc 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala @@ -66,8 +66,9 @@ private[spark] class ShuffleMemoryManager(maxMemory: Long) extends Logging { val curMem = threadMemory(threadId) val freeMemory = maxMemory - threadMemory.values.sum - // How much we can grant this thread; don't let it grow to more than 1 / numActiveThreads - val maxToGrant = math.min(numBytes, (maxMemory / numActiveThreads) - curMem) + // How much we can grant this thread; don't let it grow to more than 1 / numActiveThreads; + // don't let it be negative + val maxToGrant = math.min(numBytes, math.max(0, (maxMemory / numActiveThreads) - curMem)) if (curMem < maxMemory / (2 * numActiveThreads)) { // We want to let each thread get at least 1 / (2 * numActiveThreads) before blocking; diff --git a/core/src/test/scala/org/apache/spark/shuffle/ShuffleMemoryManagerSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/ShuffleMemoryManagerSuite.scala index d31bc22ee74f7..e0e646f0a3652 100644 --- a/core/src/test/scala/org/apache/spark/shuffle/ShuffleMemoryManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/shuffle/ShuffleMemoryManagerSuite.scala @@ -159,7 +159,7 @@ class ShuffleMemoryManagerSuite extends FunSuite with Timeouts { test("threads can block to get at least 1 / 2N memory") { // t1 grabs 1000 bytes and then waits until t2 is ready to make a request. It sleeps - // for a bit and releases 250 bytes, which should then be greanted to t2. Further requests + // for a bit and releases 250 bytes, which should then be granted to t2. Further requests // by t2 will return false right away because it now has 1 / 2N of the memory. val manager = new ShuffleMemoryManager(1000L) @@ -291,4 +291,19 @@ class ShuffleMemoryManagerSuite extends FunSuite with Timeouts { assert(state.t2WaitTime > 200, s"t2 waited less than 200 ms (${state.t2WaitTime})") } } + + test("threads should not be granted a negative size") { + val manager = new ShuffleMemoryManager(1000L) + manager.tryToAcquire(700L) + + val latch = new CountDownLatch(1) + startThread("t1") { + manager.tryToAcquire(300L) + latch.countDown() + } + latch.await() // Wait until `t1` calls `tryToAcquire` + + val granted = manager.tryToAcquire(300L) + assert(0 === granted, "granted is negative") + } } From 3e3cd5ac829b9d75bbbf837721ec7252c005862d Mon Sep 17 00:00:00 2001 From: Masayoshi TSUZUKI Date: Wed, 3 Dec 2014 13:16:24 -0800 Subject: [PATCH 27/72] [SPARK-4642] Add description about spark.yarn.queue to running-on-YARN document. Added descriptions about these parameters. - spark.yarn.queue Modified description about the defalut value of this parameter. - spark.yarn.submit.file.replication Author: Masayoshi TSUZUKI Closes #3500 from tsudukim/feature/SPARK-4642 and squashes the following commits: ce99655 [Masayoshi TSUZUKI] better gramatically. 21cf624 [Masayoshi TSUZUKI] Removed intentionally undocumented properties. 
88cac9b [Masayoshi TSUZUKI] [SPARK-4642] Documents about running-on-YARN needs update --- docs/running-on-yarn.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index 63d28b8966d70..a27873c94f36c 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -30,7 +30,7 @@ Most of the configs are the same for Spark on YARN as for other deployment modes spark.yarn.submit.file.replication - 3 + The default HDFS replication (usually 3) HDFS replication level for the files uploaded into HDFS for the application. These include things like the Spark jar, the app jar, and any distributed cache files/archives. @@ -91,6 +91,13 @@ Most of the configs are the same for Spark on YARN as for other deployment modes The amount of off heap memory (in megabytes) to be allocated per driver. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc. + + spark.yarn.queue + default + + The name of the YARN queue to which the application is submitted. + + spark.yarn.jar (none) From 17dfd415f90786a97c5e9b4c8c2b5fdf98aa3ff6 Mon Sep 17 00:00:00 2001 From: Mark Hamstra Date: Wed, 3 Dec 2014 15:08:01 -0800 Subject: [PATCH 28/72] [SPARK-4498][core] Don't transition ExecutorInfo to RUNNING until Driver adds Executor The ExecutorInfo only reaches the RUNNING state if the Driver is alive to send the ExecutorStateChanged message to master. Else, appInfo.resetRetryCount() is never called and failing Executors will eventually exceed ApplicationState.MAX_NUM_RETRY, resulting in the application being removed from the master's accounting. Author: Mark Hamstra Closes #3550 from markhamstra/SPARK-4498 and squashes the following commits: 8f543b1 [Mark Hamstra] Don't transition ExecutorInfo to RUNNING until Executor is added by Driver --- .../main/scala/org/apache/spark/deploy/client/AppClient.scala | 1 + .../scala/org/apache/spark/deploy/worker/ExecutorRunner.scala | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala b/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala index 32790053a6be8..14e340b3a972e 100644 --- a/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala +++ b/core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala @@ -134,6 +134,7 @@ private[spark] class AppClient( val fullId = appId + "/" + id logInfo("Executor added: %s on %s (%s) with %d cores".format(fullId, workerId, hostPort, cores)) + master ! ExecutorStateChanged(appId, id, ExecutorState.RUNNING, None, None) listener.executorAdded(fullId, workerId, hostPort, cores, memory) case ExecutorUpdated(id, state, message, exitStatus) => diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala index 00a43673e5cd3..7be89f9aff0f3 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala @@ -159,8 +159,6 @@ private[spark] class ExecutorRunner( Files.write(header, stderr, Charsets.UTF_8) stderrAppender = FileAppender(process.getErrorStream, stderr, conf) - state = ExecutorState.RUNNING - worker ! 
ExecutorStateChanged(appId, execId, state, None, None) // Wait for it to exit; executor may exit with code 0 (when driver instructs it to shutdown) // or with nonzero exit code val exitCode = process.waitFor() From 6c532254a97f5e1ed146ca8dba14b822dc47ebac Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Wed, 3 Dec 2014 19:08:29 -0800 Subject: [PATCH 29/72] [Release] Correctly translate contributors name in release notes This commit involves three main changes: (1) It separates the translation of contributor names from the generation of the contributors list. This is largely motivated by the Github API limit; even if we exceed this limit, we should at least be able to proceed manually as before. This is why the translation logic is abstracted into its own script translate-contributors.py. (2) When we look for candidate replacements for invalid author names, we should look for the assignees of the associated JIRAs too. As a result, the intermediate file must keep track of these. (3) This provides an interactive mode with which the user can sit at the terminal and manually pick the candidate replacement that he/she thinks makes the most sense. As before, there is a non-interactive mode that picks the first candidate that the script considers "valid." TODO: We should have a known_contributors file that stores known mappings so we don't have to go through all of this translation every time. This is also valuable because some contributors simply cannot be automatically translated. Conflicts: .gitignore --- .gitignore | 5 +- dev/create-release/generate-contributors.py | 52 +++-- dev/create-release/releaseutils.py | 39 +--- dev/create-release/translate-contributors.py | 190 +++++++++++++++++++ 4 files changed, 230 insertions(+), 56 deletions(-) create mode 100755 dev/create-release/translate-contributors.py diff --git a/.gitignore b/.gitignore index 7ec8d45e12c6b..ed792506534c3 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ *.ipr *.iml *.iws +*.pyc .idea/ sbt/*.jar .settings @@ -45,7 +46,9 @@ dependency-reduced-pom.xml checkpoint derby.log dist/ -spark-*-bin.tar.gz +dev/create-release/*txt +dev/create-release/*new +spark-*-bin-*.tgz unit-tests.log /lib/ rat-results.txt diff --git a/dev/create-release/generate-contributors.py b/dev/create-release/generate-contributors.py index 99c29ef9ff8b6..a3b78a3eac6d0 100755 --- a/dev/create-release/generate-contributors.py +++ b/dev/create-release/generate-contributors.py @@ -26,8 +26,6 @@ # You must set the following before use! JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira") -JIRA_USERNAME = os.environ.get("JIRA_USERNAME", None) -JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", None) START_COMMIT = os.environ.get("START_COMMIT", "37b100") END_COMMIT = os.environ.get("END_COMMIT", "3693ae") @@ -40,8 +38,6 @@ END_COMMIT = raw_input("Please specify ending commit hash (non-inclusive): ") # Verify provided arguments -if not JIRA_USERNAME: sys.exit("JIRA_USERNAME must be provided") -if not JIRA_PASSWORD: sys.exit("JIRA_PASSWORD must be provided") start_commit_line = get_one_line(START_COMMIT) end_commit_line = get_one_line(END_COMMIT) num_commits = num_commits_in_range(START_COMMIT, END_COMMIT) @@ -60,14 +56,6 @@ sys.exit("Ok, exiting") print "==================================================================================\n" -# Setup JIRA and github clients. 
We use two JIRA clients, one with authentication -# and one without, because authentication is slow and required only when we query -# JIRA user details but not Spark issues -jira_options = { "server": JIRA_API_BASE } -jira_client = JIRA(options = jira_options) -jira_client_auth = JIRA(options = jira_options, basic_auth = (JIRA_USERNAME, JIRA_PASSWORD)) -github_client = Github() - # Find all commits within this range print "Gathering commits within range [%s..%s)" % (START_COMMIT, END_COMMIT) commits = get_one_line_commits(START_COMMIT, END_COMMIT) @@ -105,13 +93,17 @@ def print_indented(_list): if reverts: print "Reverts (%d)" % len(reverts); print_indented(reverts) if nojiras: print "No JIRA (%d)" % len(nojiras); print_indented(nojiras) print "==================== Warning: the above commits will be ignored ==================\n" -response = raw_input("%d commits left to process. Ok to proceed? [y/N] " % len(filtered_commits)) -if response.lower() != "y": +response = raw_input("%d commits left to process. Ok to proceed? [Y/n] " % len(filtered_commits)) +if response.lower() != "y" and response: sys.exit("Ok, exiting.") # Keep track of warnings to tell the user at the end warnings = [] +# Mapping from the invalid author name to its associated JIRA issues +# E.g. andrewor14 -> set("SPARK-2413", "SPARK-3551", "SPARK-3471") +invalid_authors = {} + # Populate a map that groups issues and components by author # It takes the form: Author name -> { Contribution type -> Spark components } # For instance, @@ -127,16 +119,23 @@ def print_indented(_list): # } # author_info = {} +jira_options = { "server": JIRA_API_BASE } +jira_client = JIRA(options = jira_options) print "\n=========================== Compiling contributor list ===========================" for commit in filtered_commits: commit_hash = re.findall("^[a-z0-9]+", commit)[0] issues = re.findall("SPARK-[0-9]+", commit.upper()) - # Translate the author in case the github username is not an actual name - # Also guard against any special characters used in the name - # Note the JIRA client we use here must have authentication enabled author = get_author(commit_hash) - author = unidecode.unidecode(unicode(author, "UTF-8")) - author = translate_author(author, github_client, jira_client_auth, warnings) + author = unidecode.unidecode(unicode(author, "UTF-8")).strip() # guard against special characters + # If the author name is invalid, keep track of it along + # with all associated issues so we can translate it later + if is_valid_author(author): + author = capitalize_author(author) + else: + if author not in invalid_authors: + invalid_authors[author] = set() + for issue in issues: + invalid_authors[author].add(issue) date = get_date(commit_hash) # Parse components from the commit message, if any commit_components = find_components(commit, commit_hash) @@ -147,7 +146,7 @@ def populate(issue_type, components): author_info[author] = {} if issue_type not in author_info[author]: author_info[author][issue_type] = set() - for component in all_components: + for component in components: author_info[author][issue_type].add(component) # Find issues and components associated with this commit for issue in issues: @@ -168,7 +167,6 @@ def populate(issue_type, components): # Each line takes the format "Author name - semi-colon delimited contributions" # e.g. Andrew Or - Bug fixes in Windows, Core, and Web UI; improvements in Core # e.g. 
Tathagata Das - Bug fixes and new features in Streaming -contributors_file_name = "contributors.txt" contributors_file = open(contributors_file_name, "w") authors = author_info.keys() authors.sort() @@ -192,11 +190,23 @@ def populate(issue_type, components): # Do not use python's capitalize() on the whole string to preserve case assert contribution contribution = contribution[0].capitalize() + contribution[1:] + # If the author name is invalid, use an intermediate format that + # can be translated through translate-contributors.py later + # E.g. andrewor14/SPARK-3425/SPARK-1157/SPARK-6672 + if author in invalid_authors and invalid_authors[author]: + author = author + "/" + "/".join(invalid_authors[author]) line = "%s - %s" % (author, contribution) contributors_file.write(line + "\n") contributors_file.close() print "Contributors list is successfully written to %s!" % contributors_file_name +# Prompt the user to translate author names if necessary +if invalid_authors: + warnings.append("Found the following invalid authors:") + for a in invalid_authors: + warnings.append("\t%s" % a) + warnings.append("Please run './translate-contributors.py' to translate them.") + # Log any warnings encountered in the process if warnings: print "\n============ Warnings encountered while creating the contributor list ============" diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py index 0d6830b11dc73..76a10c32886d4 100755 --- a/dev/create-release/releaseutils.py +++ b/dev/create-release/releaseutils.py @@ -44,6 +44,9 @@ print "Install using 'sudo pip install unidecode'" sys.exit(-1) +# Contributors list file name +contributors_file_name = "contributors.txt" + # Utility functions run git commands (written with Git 1.8.5) def run_cmd(cmd): return Popen(cmd, stdout=PIPE).communicate()[0] def get_author(commit_hash): @@ -69,7 +72,8 @@ def num_commits_in_range(start_hash, end_hash): "build": "build fixes", "improvement": "improvements", "new feature": "new features", - "documentation": "documentation" + "documentation": "documentation", + "test": "test" } # Maintain a mapping for translating component names when creating the release notes @@ -182,36 +186,3 @@ def capitalize_author(author): words = [w[0].capitalize() + w[1:] for w in words if w] return " ".join(words) -# Maintain a mapping of translated author names as a cache -translated_authors = {} - -# Format the given author in a format appropriate for the contributors list. -# If the author is not an actual name, search github and JIRA for potential -# replacements and log all candidates as a warning. 
-def translate_author(github_author, github_client, jira_client, warnings): - if is_valid_author(github_author): - return capitalize_author(github_author) - # If the translated author is already cached, just return it - if github_author in translated_authors: - return translated_authors[github_author] - # Otherwise, author name is not found, so we need to search for an alternative name - candidates = set() - github_name = get_github_name(github_author, github_client) - jira_name = get_jira_name(github_author, jira_client) - if is_valid_author(github_name): github_name = capitalize_author(github_name) - if is_valid_author(jira_name): jira_name = capitalize_author(jira_name) - if github_name: candidates.add(github_name) - if jira_name: candidates.add(jira_name) - # Only use the github name as a replacement automatically - # The JIRA name may not make sense because it can belong to someone else - if is_valid_author(github_name): - candidates_message = " (another candidate is %s)" % jira_name if jira_name else "" - warnings.append("Replacing github user %s with %s%s" % (github_author, github_name, candidates_message)) - translated_authors[github_name] = github_name - return translated_authors[github_name] - # No direct replacement, so return the original author and list any candidates found - candidates_message = " (candidates: %s)" % nice_join(candidates) if candidates else "" - warnings.append("Unable to find a replacement for github user %s%s" % (github_author, candidates_message)) - translated_authors[github_author] = github_author - return translated_authors[github_author] - diff --git a/dev/create-release/translate-contributors.py b/dev/create-release/translate-contributors.py new file mode 100755 index 0000000000000..ef4625b003cb6 --- /dev/null +++ b/dev/create-release/translate-contributors.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script translates invalid authors in the contributors list generated +# by generate-contributors.py. When the script encounters an author name that +# is considered invalid, it searches Github and JIRA in an attempt to search +# for replacements. This tool runs in two modes: +# +# (1) Interactive mode: For each invalid author name, this script presents +# all candidate replacements to the user and awaits user response. In this +# mode, the user may also input a custom name. This is the default. +# +# (2) Non-interactive mode: For each invalid author name, this script replaces +# the name with the first valid candidate it can find. If there is none, it +# uses the original name. This can be enabled through the --non-interactive flag. + +import os +import sys + +from releaseutils import * + +# You must set the following before use! 
+JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira") +JIRA_USERNAME = os.environ.get("JIRA_USERNAME", None) +JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", None) +if not JIRA_USERNAME or not JIRA_PASSWORD: + sys.exit("Both JIRA_USERNAME and JIRA_PASSWORD must be set") + +# Write new contributors list to .new +if not os.path.isfile(contributors_file_name): + print "Contributors file %s does not exist!" % contributors_file_name + print "Have you run ./generate-contributors.py yet?" + sys.exit(1) +contributors_file = open(contributors_file_name, "r") +new_contributors_file_name = contributors_file_name + ".new" +new_contributors_file = open(new_contributors_file_name, "w") +warnings = [] + +# In non-interactive mode, this script will choose the first replacement that is valid +INTERACTIVE_MODE = True +if len(sys.argv) > 1: + options = set(sys.argv[1:]) + if "--non-interactive" in options: + INTERACTIVE_MODE = False +if INTERACTIVE_MODE: + print "Running in interactive mode. To disable this, provide the --non-interactive flag." + +# Setup Github and JIRA clients +jira_options = { "server": JIRA_API_BASE } +jira_client = JIRA(options = jira_options, basic_auth = (JIRA_USERNAME, JIRA_PASSWORD)) +github_client = Github() + +# Generate candidates for the given author. This should only be called if the given author +# name does not represent a full name as this operation is somewhat expensive. Under the +# hood, it makes several calls to the Github and JIRA API servers to find the candidates. +# +# This returns a list of (candidate name, source) 2-tuples. E.g. +# [ +# (NOT_FOUND, "No full name found for Github user andrewor14"), +# ("Andrew Or", "Full name of JIRA user andrewor14"), +# ("Andrew Orso", "Full name of SPARK-1444 assignee andrewor14"), +# ("Andrew Ordall", "Full name of SPARK-1663 assignee andrewor14"), +# (NOT_FOUND, "No assignee found for SPARK-1763") +# ] +NOT_FOUND = "Not found" +def generate_candidates(author, issues): + candidates = [] + # First check for full name of Github user + github_name = get_github_name(new_author, github_client) + if github_name: + candidates.append((github_name, "Full name of Github user %s" % new_author)) + else: + candidates.append((NOT_FOUND, "No full name found for Github user %s" % new_author)) + # Then do the same for JIRA user + jira_name = get_jira_name(new_author, jira_client) + if jira_name: + candidates.append((jira_name, "Full name of JIRA user %s" % new_author)) + else: + candidates.append((NOT_FOUND, "No full name found for JIRA user %s" % new_author)) + # Then do the same for the assignee of each of the associated JIRAs + # Note that a given issue may not have an assignee, or the assignee may not have a full name + for issue in issues: + jira_issue = jira_client.issue(issue) + jira_assignee = jira_issue.fields.assignee + if jira_assignee: + user_name = jira_assignee.name + display_name = jira_assignee.displayName + if display_name: + candidates.append((display_name, "Full name of %s assignee %s" % (issue, user_name))) + else: + candidates.append((NOT_FOUND, "No full name found for %s assignee %" % (issue, user_name))) + else: + candidates.append((NOT_FOUND, "No assignee found for %s" % issue)) + # Guard against special characters in candidate names + # Note that the candidate name may already be in unicode (JIRA returns this) + for i, (candidate, source) in enumerate(candidates): + try: + candidate = unicode(candidate, "UTF-8") + except TypeError: + # already in unicode + pass + candidate = 
unidecode.unidecode(candidate).strip() + candidates[i] = (candidate, source) + return candidates + +# Translate each invalid author by searching for possible candidates from Github and JIRA +# In interactive mode, this script presents the user with a list of choices and have the user +# select from this list. Additionally, the user may also choose to enter a custom name. +# In non-interactive mode, this script picks the first valid author name from the candidates +# If no such name exists, the original name is used (without the JIRA numbers). +print "\n========================== Translating contributor list ==========================" +for line in contributors_file: + author = line.split(" - ")[0] + print "Processing author %s" % author + if not author: + print " ERROR: Expected the following format - " + print " ERROR: Actual = %s" % line + if not is_valid_author(author): + new_author = author.split("/")[0] + issues = author.split("/")[1:] + candidates = generate_candidates(new_author, issues) + # Print out potential replacement candidates along with the sources, e.g. + # [X] No full name found for Github user andrewor14 + # [0] Andrew Or - Full name of JIRA user andrewor14 + # [1] Andrew Orso - Full name of SPARK-1444 assignee andrewor14 + # [2] Andrew Ordall - Full name of SPARK-1663 assignee andrewor14 + # [X] No assignee found for SPARK-1763 + # [3] Custom + candidate_names = [] + for candidate, source in candidates: + if candidate == NOT_FOUND: + print " [X] %s" % source + else: + index = len(candidate_names) + candidate_names.append(candidate) + print " [%d] %s - %s" % (index, candidate, source) + custom_index = len(candidate_names) + # In interactive mode, additionally provide "custom" option and await user response + if INTERACTIVE_MODE: + print " [%d] Custom" % custom_index + response = raw_input(" Your choice: ") + while not response.isdigit() or int(response) > custom_index: + response = raw_input(" Please enter an integer between 0 and %d: " % custom_index) + response = int(response) + if response == custom_index: + new_author = raw_input(" Please type a custom name for this author: ") + else: + new_author = candidate_names[response] + # In non-interactive mode, just pick the first candidate + else: + valid_candidate_names = [name for name, _ in candidates\ + if is_valid_author(name) and name != NOT_FOUND] + if valid_candidate_names: + new_author = valid_candidate_names[0] + # Finally, capitalize the author and replace the original one with it + # If the final replacement is still invalid, log a warning + if is_valid_author(new_author): + new_author = capitalize_author(new_author) + else: + warnings.append("Unable to find a valid name %s for author %s" % (new_author, author)) + print " * Replacing %s with %s" % (author, new_author) + line = line.replace(author, new_author) + new_contributors_file.write(line) +print "==================================================================================\n" +contributors_file.close() +new_contributors_file.close() + +print "Translated contributors list successfully written to %s!" % new_contributors_file_name + +# Log any warnings encountered in the process +if warnings: + print "\n========== Warnings encountered while translating the contributor list ===========" + for w in warnings: print w + print "Please manually correct these in the final contributors list at %s." 
% new_contributors_file_name + print "==================================================================================\n" + From 5ac55c8419a9063dce97c4050e5acea11cb5ed51 Mon Sep 17 00:00:00 2001 From: WangTaoTheTonic Date: Thu, 4 Dec 2014 11:52:47 -0800 Subject: [PATCH 30/72] [SPARK-4253] Ignore spark.driver.host in yarn-cluster and standalone-cluster modes In yarn-cluster and standalone-cluster modes, we don't know where driver will run until it is launched. If the `spark.driver.host` property is set on the submitting machine and propagated to the driver through SparkConf then this will lead to errors when the driver launches. This patch fixes this issue by dropping the `spark.driver.host` property in SparkSubmit when running in a cluster deploy mode. Author: WangTaoTheTonic Author: WangTao Closes #3112 from WangTaoTheTonic/SPARK4253 and squashes the following commits: ed1a25c [WangTaoTheTonic] revert unrelated formatting issue 02c4e49 [WangTao] add comment 32a3f3f [WangTaoTheTonic] ingore it in SparkSubmit instead of SparkContext 667cf24 [WangTaoTheTonic] document fix ff8d5f7 [WangTaoTheTonic] also ignore it in standalone cluster mode 2286e6b [WangTao] ignore spark.driver.host in yarn-cluster mode (cherry picked from commit 8106b1e36b2c2b9f5dc5d7252540e48cc3fc96d5) Signed-off-by: Josh Rosen Conflicts: core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala --- .../src/main/scala/org/apache/spark/deploy/SparkSubmit.scala | 5 +++++ .../spark/deploy/yarn/ApplicationMasterArguments.scala | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 68308c4f2ba37..c941d3a5b76e2 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -284,6 +284,11 @@ object SparkSubmit { sysProps.getOrElseUpdate(k, v) } + // Ignore spark.driver.host in cluster modes (see SPARK-4253 for context): + if (deployMode == CLUSTER) { + sysProps -= "spark.driver.host" + } + (childArgs, childClasspath, sysProps, childMainClass) } diff --git a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala index 424b0fb0936f2..ca34f5fb174ba 100644 --- a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala +++ b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMasterArguments.scala @@ -35,7 +35,7 @@ class ApplicationMasterArguments(val args: Array[String]) { var args = inputArgs - while (! args.isEmpty) { + while (!args.isEmpty) { // --num-workers, --worker-memory, and --worker-cores are deprecated since 1.0, // the properties with executor in their names are preferred. args match { From d01fdd3bbc8e223895b98e9ef408e28206892c24 Mon Sep 17 00:00:00 2001 From: alexdebrie Date: Thu, 4 Dec 2014 14:13:59 -0800 Subject: [PATCH 31/72] [SPARK-4745] Fix get_existing_cluster() function with multiple security groups The current get_existing_cluster() function would only find an instance belonged to a cluster if the instance's security groups == cluster_name + "-master" (or "-slaves"). This fix allows for multiple security groups by checking if the cluster_name + "-master" security group is in the list of groups for a particular instance. 
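As a minimal illustration (not part of the patch itself; the cluster name and the extra group here are hypothetical), the difference between the two checks is:

    # An instance that belongs to the cluster but also carries an extra security group.
    cluster_name = "my-cluster"
    group_names = [cluster_name + "-master", "extra-monitoring-group"]

    # Old check: only matches when the master group is the instance's sole group.
    print(group_names == [cluster_name + "-master"])    # False -> master node is missed

    # New check: matches whenever the master group is present in the list.
    print((cluster_name + "-master") in group_names)    # True  -> master node is found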
Author: alexdebrie Closes #3596 from alexdebrie/master and squashes the following commits: 9d51232 [alexdebrie] Fix get_existing_cluster() function with multiple security groups (cherry picked from commit 794f3aec24acb578e258532ad0590554d07958ba) Signed-off-by: Josh Rosen --- ec2/spark_ec2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py index 374d3a6a949e9..fef95bb8cbf82 100755 --- a/ec2/spark_ec2.py +++ b/ec2/spark_ec2.py @@ -475,9 +475,9 @@ def get_existing_cluster(conn, opts, cluster_name, die_on_error=True): active = [i for i in res.instances if is_active(i)] for inst in active: group_names = [g.name for g in inst.groups] - if group_names == [cluster_name + "-master"]: + if (cluster_name + "-master") in group_names: master_nodes.append(inst) - elif group_names == [cluster_name + "-slaves"]: + elif (cluster_name + "-slaves") in group_names: slave_nodes.append(inst) if any((master_nodes, slave_nodes)): print ("Found %d master(s), %d slaves" % (len(master_nodes), len(slave_nodes))) From e98aa5476151e9802045934890afd53b27762058 Mon Sep 17 00:00:00 2001 From: Saldanha Date: Thu, 4 Dec 2014 14:22:09 -0800 Subject: [PATCH 32/72] [SPARK-4459] Change groupBy type parameter from K to U Please see https://issues.apache.org/jira/browse/SPARK-4459 Author: Saldanha Closes #3327 from alokito/master and squashes the following commits: 54b1095 [Saldanha] [SPARK-4459] changed type parameter for keyBy from K to U d5f73c3 [Saldanha] [SPARK-4459] added keyBy test 316ad77 [Saldanha] SPARK-4459 changed type parameter for groupBy from K to U. 62ddd4b [Saldanha] SPARK-4459 added failing unit test (cherry picked from commit 743a889d2778f797aabc3b1e8146e7aa32b62a48) Signed-off-by: Josh Rosen --- .../apache/spark/api/java/JavaRDDLike.scala | 17 ++++---- .../java/org/apache/spark/JavaAPISuite.java | 41 +++++++++++++++++++ 2 files changed, 51 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala index d230678238ab9..ed61726e16c4b 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala @@ -210,8 +210,9 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { * Return an RDD of grouped elements. Each group consists of a key and a sequence of elements * mapping to that key. */ - def groupBy[K](f: JFunction[T, K]): JavaPairRDD[K, JIterable[T]] = { - implicit val ctagK: ClassTag[K] = fakeClassTag + def groupBy[U](f: JFunction[T, U]): JavaPairRDD[U, JIterable[T]] = { + // The type parameter is U instead of K in order to work around a compiler bug; see SPARK-4459 + implicit val ctagK: ClassTag[U] = fakeClassTag implicit val ctagV: ClassTag[JList[T]] = fakeClassTag JavaPairRDD.fromRDD(groupByResultToJava(rdd.groupBy(f)(fakeClassTag))) } @@ -220,10 +221,11 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { * Return an RDD of grouped elements. Each group consists of a key and a sequence of elements * mapping to that key. 
*/ - def groupBy[K](f: JFunction[T, K], numPartitions: Int): JavaPairRDD[K, JIterable[T]] = { - implicit val ctagK: ClassTag[K] = fakeClassTag + def groupBy[U](f: JFunction[T, U], numPartitions: Int): JavaPairRDD[U, JIterable[T]] = { + // The type parameter is U instead of K in order to work around a compiler bug; see SPARK-4459 + implicit val ctagK: ClassTag[U] = fakeClassTag implicit val ctagV: ClassTag[JList[T]] = fakeClassTag - JavaPairRDD.fromRDD(groupByResultToJava(rdd.groupBy(f, numPartitions)(fakeClassTag[K]))) + JavaPairRDD.fromRDD(groupByResultToJava(rdd.groupBy(f, numPartitions)(fakeClassTag[U]))) } /** @@ -458,8 +460,9 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { /** * Creates tuples of the elements in this RDD by applying `f`. */ - def keyBy[K](f: JFunction[T, K]): JavaPairRDD[K, T] = { - implicit val ctag: ClassTag[K] = fakeClassTag + def keyBy[U](f: JFunction[T, U]): JavaPairRDD[U, T] = { + // The type parameter is U instead of K in order to work around a compiler bug; see SPARK-4459 + implicit val ctag: ClassTag[U] = fakeClassTag JavaPairRDD.fromRDD(rdd.keyBy(f)) } diff --git a/core/src/test/java/org/apache/spark/JavaAPISuite.java b/core/src/test/java/org/apache/spark/JavaAPISuite.java index e1c13de04a0be..64b6377de8d32 100644 --- a/core/src/test/java/org/apache/spark/JavaAPISuite.java +++ b/core/src/test/java/org/apache/spark/JavaAPISuite.java @@ -298,6 +298,47 @@ public Boolean call(Integer x) { Assert.assertEquals(5, Iterables.size(oddsAndEvens.lookup(false).get(0))); // Odds } + @Test + public void groupByOnPairRDD() { + // Regression test for SPARK-4459 + JavaRDD rdd = sc.parallelize(Arrays.asList(1, 1, 2, 3, 5, 8, 13)); + Function, Boolean> areOdd = + new Function, Boolean>() { + @Override + public Boolean call(Tuple2 x) { + return (x._1() % 2 == 0) && (x._2() % 2 == 0); + } + }; + JavaPairRDD pairRDD = rdd.zip(rdd); + JavaPairRDD>> oddsAndEvens = pairRDD.groupBy(areOdd); + Assert.assertEquals(2, oddsAndEvens.count()); + Assert.assertEquals(2, Iterables.size(oddsAndEvens.lookup(true).get(0))); // Evens + Assert.assertEquals(5, Iterables.size(oddsAndEvens.lookup(false).get(0))); // Odds + + oddsAndEvens = pairRDD.groupBy(areOdd, 1); + Assert.assertEquals(2, oddsAndEvens.count()); + Assert.assertEquals(2, Iterables.size(oddsAndEvens.lookup(true).get(0))); // Evens + Assert.assertEquals(5, Iterables.size(oddsAndEvens.lookup(false).get(0))); // Odds + } + + @SuppressWarnings("unchecked") + @Test + public void keyByOnPairRDD() { + // Regression test for SPARK-4459 + JavaRDD rdd = sc.parallelize(Arrays.asList(1, 1, 2, 3, 5, 8, 13)); + Function, String> sumToString = + new Function, String>() { + @Override + public String call(Tuple2 x) { + return String.valueOf(x._1() + x._2()); + } + }; + JavaPairRDD pairRDD = rdd.zip(rdd); + JavaPairRDD> keyed = pairRDD.keyBy(sumToString); + Assert.assertEquals(7, keyed.count()); + Assert.assertEquals(1, (long) keyed.lookup("2").get(0)._1()); + } + @SuppressWarnings("unchecked") @Test public void cogroup() { From bf637e0414fb6a59d4b2462cf3ece6937ae591fe Mon Sep 17 00:00:00 2001 From: lewuathe Date: Thu, 4 Dec 2014 15:14:36 -0800 Subject: [PATCH 33/72] [SPARK-4652][DOCS] Add docs about spark-git-repo option There might be some cases when WIPS spark version need to be run on EC2 cluster. In order to setup this type of cluster more easily, add --spark-git-repo option description to ec2 documentation. 
Author: lewuathe Author: Josh Rosen Closes #3513 from Lewuathe/doc-for-development-spark-cluster and squashes the following commits: 6dae8ee [lewuathe] Wrap consistent with other descriptions cfaf9be [lewuathe] Add docs about spark-git-repo option (Editing / cleanup by Josh Rosen) (cherry picked from commit ab8177da2defab1ecd8bc0cd5a21f07be5b8d2c5) Signed-off-by: Josh Rosen --- docs/ec2-scripts.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/ec2-scripts.md b/docs/ec2-scripts.md index 156a727026790..1a588ade6baf1 100644 --- a/docs/ec2-scripts.md +++ b/docs/ec2-scripts.md @@ -76,6 +76,11 @@ another. specified version of Spark. VERSION can be a version number (e.g. "0.7.3") or a specific git hash. By default, a recent version will be used. +- `--spark-git-repo=` will let you run a custom version of + Spark that is built from the given git repository. By default, the + [Apache Github mirror](https://github.com/apache/spark) will be used. + When using a custom Spark version, `--spark-version` must be set to git + commit hash, such as 317e114, instead of a version number. - If one of your launches fails due to e.g. not having the right permissions on your private key file, you can run `launch` with the `--resume` option to restart the setup process on an existing cluster. From b09382a46437db728397f4515168e8f5d6105c38 Mon Sep 17 00:00:00 2001 From: Masayoshi TSUZUKI Date: Thu, 4 Dec 2014 18:13:23 -0800 Subject: [PATCH 34/72] [SPARK-4421] Wrong link in spark-standalone.html Modified the link of building Spark. (backport version of #3279.) Author: Masayoshi TSUZUKI Closes #3280 from tsudukim/feature/SPARK-4421-2 and squashes the following commits: 3b4d38d [Masayoshi TSUZUKI] [SPARK-4421] Wrong link in spark-standalone.html --- docs/spark-standalone.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md index 333ad5bc72061..88ab4f9f142e5 100644 --- a/docs/spark-standalone.md +++ b/docs/spark-standalone.md @@ -10,7 +10,7 @@ In addition to running on the Mesos or YARN cluster managers, Spark also provide # Installing Spark Standalone to a Cluster -To install Spark Standalone mode, you simply place a compiled version of Spark on each node on the cluster. You can obtain pre-built versions of Spark with each release or [build it yourself](index.html#building). +To install Spark Standalone mode, you simply place a compiled version of Spark on each node on the cluster. You can obtain pre-built versions of Spark with each release or [build it yourself](building-with-maven.html). # Starting a Cluster Manually From 8ee2d1873d39a68aff87efe6dbd90b65a5fb1947 Mon Sep 17 00:00:00 2001 From: Andy Konwinski Date: Thu, 4 Dec 2014 18:27:02 -0800 Subject: [PATCH 35/72] Fix typo in Spark SQL docs. Author: Andy Konwinski Closes #3611 from andyk/patch-3 and squashes the following commits: 7bab333 [Andy Konwinski] Fix typo in Spark SQL docs. (cherry picked from commit 15cf3b0125fe238dea2ce13e703034ba7cef477f) Signed-off-by: Josh Rosen --- docs/sql-programming-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 578ae09845ea6..6f5fb42eb8d20 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -278,7 +278,7 @@ performed on JSON files. from pyspark.sql import SQLContext, Row sqlContext = SQLContext(sc) -# Load a text file and convert each line to a dictionary. +# Load a text file and convert each line to a Row. 
lines = sc.textFile("examples/src/main/resources/people.txt") parts = lines.map(lambda l: l.split(",")) people = parts.map(lambda p: Row(name=p[0], age=int(p[1]))) From 16bc77b7d91d9b21b9be45d346c47443386295d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christophe=20Pr=C3=A9aud?= Date: Mon, 8 Dec 2014 11:44:54 -0800 Subject: [PATCH 36/72] [SPARK-4764] Ensure that files are fetched atomically MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tempFile is created in the same directory than targetFile, so that the move from tempFile to targetFile is always atomic Author: Christophe Préaud Closes #2855 from preaudc/master and squashes the following commits: 9ba89ca [Christophe Préaud] Ensure that files are fetched atomically 54419ae [Christophe Préaud] Merge remote-tracking branch 'upstream/master' c6a5590 [Christophe Préaud] Revert commit 8ea871f8130b2490f1bad7374a819bf56f0ccbbd 7456a33 [Christophe Préaud] Merge remote-tracking branch 'upstream/master' 8ea871f [Christophe Préaud] Ensure that files are fetched atomically (cherry picked from commit ab2abcb5ef925f15fa0e08d34a79b94a7b6578ef) Signed-off-by: Josh Rosen Conflicts: core/src/main/scala/org/apache/spark/util/Utils.scala --- core/src/main/scala/org/apache/spark/util/Utils.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index ef11ed3e48305..20bf36e99e785 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -378,8 +378,7 @@ private[spark] object Utils extends Logging { */ def fetchFile(url: String, targetDir: File, conf: SparkConf, securityMgr: SecurityManager) { val filename = url.split("/").last - val tempDir = getLocalDir(conf) - val tempFile = File.createTempFile("fetchFileTemp", null, new File(tempDir)) + val tempFile = File.createTempFile("fetchFileTemp", null, new File(targetDir.getAbsolutePath)) val targetFile = new File(targetDir, filename) val uri = new URI(url) val fileOverwrite = conf.getBoolean("spark.files.overwrite", false) From fe7d7a9834c953d155b3157242b722c67972eec5 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Mon, 8 Dec 2014 16:13:03 -0800 Subject: [PATCH 37/72] SPARK-3926 [CORE] Reopened: result of JavaRDD collectAsMap() is not serializable My original 'fix' didn't fix at all. Now, there's a unit test to check whether it works. Of the two options to really fix it -- copy the `Map` to a `java.util.HashMap`, or copy and modify Scala's implementation in `Wrappers.MapWrapper`, I went with the latter. Author: Sean Owen Closes #3587 from srowen/SPARK-3926 and squashes the following commits: 8586bb9 [Sean Owen] Remove unneeded no-arg constructor, and add additional note about copied code in LICENSE 7bb0e66 [Sean Owen] Make SerializableMapWrapper actually serialize, and add unit test (cherry picked from commit e829bfa1ab9b68f44c489d26efb042f793fd9362) Signed-off-by: Josh Rosen --- LICENSE | 3 +- .../org/apache/spark/api/java/JavaUtils.scala | 62 ++++++++++++++++++- .../java/org/apache/spark/JavaAPISuite.java | 13 ++++ 3 files changed, 75 insertions(+), 3 deletions(-) diff --git a/LICENSE b/LICENSE index e9a1153fdc5db..ad120d8ed7602 100644 --- a/LICENSE +++ b/LICENSE @@ -363,7 +363,8 @@ THE SOFTWARE. 
======================================================================== For Scala Interpreter classes (all .scala files in repl/src/main/scala -except for Main.Scala, SparkHelper.scala and ExecutorClassLoader.scala): +except for Main.Scala, SparkHelper.scala and ExecutorClassLoader.scala), +and for SerializableMapWrapper in JavaUtils.scala: ======================================================================== Copyright (c) 2002-2013 EPFL diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaUtils.scala b/core/src/main/scala/org/apache/spark/api/java/JavaUtils.scala index b52d0a5028e84..86e94931300f8 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaUtils.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaUtils.scala @@ -19,7 +19,8 @@ package org.apache.spark.api.java import com.google.common.base.Optional -import scala.collection.convert.Wrappers.MapWrapper +import java.{util => ju} +import scala.collection.mutable private[spark] object JavaUtils { def optionToOptional[T](option: Option[T]): Optional[T] = @@ -32,7 +33,64 @@ private[spark] object JavaUtils { def mapAsSerializableJavaMap[A, B](underlying: collection.Map[A, B]) = new SerializableMapWrapper(underlying) + // Implementation is copied from scala.collection.convert.Wrappers.MapWrapper, + // but implements java.io.Serializable. It can't just be subclassed to make it + // Serializable since the MapWrapper class has no no-arg constructor. This class + // doesn't need a no-arg constructor though. class SerializableMapWrapper[A, B](underlying: collection.Map[A, B]) - extends MapWrapper(underlying) with java.io.Serializable + extends ju.AbstractMap[A, B] with java.io.Serializable { self => + override def size = underlying.size + + override def get(key: AnyRef): B = try { + underlying get key.asInstanceOf[A] match { + case None => null.asInstanceOf[B] + case Some(v) => v + } + } catch { + case ex: ClassCastException => null.asInstanceOf[B] + } + + override def entrySet: ju.Set[ju.Map.Entry[A, B]] = new ju.AbstractSet[ju.Map.Entry[A, B]] { + def size = self.size + + def iterator = new ju.Iterator[ju.Map.Entry[A, B]] { + val ui = underlying.iterator + var prev : Option[A] = None + + def hasNext = ui.hasNext + + def next() = { + val (k, v) = ui.next + prev = Some(k) + new ju.Map.Entry[A, B] { + import scala.util.hashing.byteswap32 + def getKey = k + def getValue = v + def setValue(v1 : B) = self.put(k, v1) + override def hashCode = byteswap32(k.hashCode) + (byteswap32(v.hashCode) << 16) + override def equals(other: Any) = other match { + case e: ju.Map.Entry[_, _] => k == e.getKey && v == e.getValue + case _ => false + } + } + } + + def remove() { + prev match { + case Some(k) => + underlying match { + case mm: mutable.Map[a, _] => + mm remove k + prev = None + case _ => + throw new UnsupportedOperationException("remove") + } + case _ => + throw new IllegalStateException("next must be called at least once before remove") + } + } + } + } + } } diff --git a/core/src/test/java/org/apache/spark/JavaAPISuite.java b/core/src/test/java/org/apache/spark/JavaAPISuite.java index 64b6377de8d32..9b651ee95dfaa 100644 --- a/core/src/test/java/org/apache/spark/JavaAPISuite.java +++ b/core/src/test/java/org/apache/spark/JavaAPISuite.java @@ -1256,6 +1256,19 @@ public Tuple2 call(Integer x) { pairRDD.collectAsMap(); // Used to crash with ClassCastException } + @SuppressWarnings("unchecked") + @Test + public void collectAsMapAndSerialize() throws Exception { + JavaPairRDD rdd = + sc.parallelizePairs(Arrays.asList(new 
Tuple2("foo", 1))); + Map map = rdd.collectAsMap(); + ByteArrayOutputStream bytes = new ByteArrayOutputStream(); + new ObjectOutputStream(bytes).writeObject(map); + Map deserializedMap = (Map) + new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray())).readObject(); + Assert.assertEquals(1, deserializedMap.get("foo").intValue()); + } + @Test @SuppressWarnings("unchecked") public void sampleByKey() { From 9b99237441c355179dbea2662c5baef0cef82e5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?hushan=5B=E8=83=A1=E7=8F=8A=5D?= Date: Tue, 9 Dec 2014 15:11:20 -0800 Subject: [PATCH 38/72] [SPARK-4714] BlockManager.dropFromMemory() should check whether block has been removed after synchronizing on BlockInfo instance. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After synchronizing on the `info` lock in the `removeBlock`/`dropOldBlocks`/`dropFromMemory` methods in BlockManager, the block that `info` represented may have already removed. The three methods have the same logic to get the `info` lock: ``` info = blockInfo.get(id) if (info != null) { info.synchronized { // do something } } ``` So, there is chance that when a thread enters the `info.synchronized` block, `info` has already been removed from the `blockInfo` map by some other thread who entered `info.synchronized` first. The `removeBlock` and `dropOldBlocks` methods are idempotent, so it's safe for them to run on blocks that have already been removed. But in `dropFromMemory` it may be problematic since it may drop block data which already removed into the diskstore, and this calls data store operations that are not designed to handle missing blocks. This patch fixes this issue by adding a check to `dropFromMemory` to test whether blocks have been removed by a racing thread. Author: hushan[胡珊] Closes #3574 from suyanNone/refine-block-concurrency and squashes the following commits: edb989d [hushan[胡珊]] Refine code style and comments position 55fa4ba [hushan[胡珊]] refine code e57e270 [hushan[胡珊]] add check info is already remove or not while having gotten info.syn (cherry picked from commit 30dca924df0efbdc1b638fa7c705fe8743570783) Signed-off-by: Josh Rosen --- .../main/scala/org/apache/spark/storage/BlockManager.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index 3113d4a3149fc..881b95633831b 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -975,8 +975,10 @@ private[spark] class BlockManager( // If we get here, the block write failed. logWarning(s"Block $blockId was marked as failure. Nothing to drop") return None + } else if (blockInfo.get(blockId).isEmpty) { + logWarning(s"Block $blockId was already dropped.") + return None } - var blockIsUpdated = false val level = info.level From 6dcafa7ce53c4850a8d6dda138caf88cc8b640c9 Mon Sep 17 00:00:00 2001 From: Nathan Kronenfeld Date: Tue, 9 Dec 2014 23:53:17 -0800 Subject: [PATCH 39/72] [SPARK-4772] Clear local copies of accumulators as soon as we're done with them Accumulators keep thread-local copies of themselves. These copies were only cleared at the beginning of a task. This meant that (a) the memory they used was tied up until the next task ran on that thread, and (b) if a thread died, the memory it had used for accumulators was locked up forever on that worker. 
This PR clears the thread-local copies of accumulators at the end of each task, in the tasks finally block, to make sure they are cleaned up between tasks. It also stores them in a ThreadLocal object, so that if, for some reason, the thread dies, any memory they are using at the time should be freed up. Author: Nathan Kronenfeld Closes #3570 from nkronenfeld/Accumulator-Improvements and squashes the following commits: a581f3f [Nathan Kronenfeld] Change Accumulators to private[spark] instead of adding mima exclude to get around false positive in mima tests b6c2180 [Nathan Kronenfeld] Include MiMa exclude as per build error instructions - this version incompatibility should be irrelevent, as it will only surface if a master is talking to a worker running a different version of spark. 537baad [Nathan Kronenfeld] Fuller refactoring as intended, incorporating JR's suggestions for ThreadLocal localAccums, and keeping clear(), but also calling it in tasks' finally block, rather than just at the beginning of the task. 39a82f2 [Nathan Kronenfeld] Clear local copies of accumulators as soon as we're done with them (cherry picked from commit 94b377f94487109a1cc3e07dd230b1df7a96e28d) Signed-off-by: Josh Rosen Conflicts: core/src/main/scala/org/apache/spark/Accumulators.scala core/src/main/scala/org/apache/spark/executor/Executor.scala --- .../main/scala/org/apache/spark/Accumulators.scala | 14 ++++++++------ .../scala/org/apache/spark/executor/Executor.scala | 3 ++- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/Accumulators.scala b/core/src/main/scala/org/apache/spark/Accumulators.scala index 2301caafb07ff..c9665d9fdcaaa 100644 --- a/core/src/main/scala/org/apache/spark/Accumulators.scala +++ b/core/src/main/scala/org/apache/spark/Accumulators.scala @@ -18,6 +18,7 @@ package org.apache.spark import java.io.{ObjectInputStream, Serializable} +import java.lang.ThreadLocal import scala.collection.generic.Growable import scala.collection.mutable.Map @@ -246,10 +247,12 @@ trait AccumulatorParam[T] extends AccumulableParam[T, T] { // TODO: The multi-thread support in accumulators is kind of lame; check // if there's a more intuitive way of doing it right -private object Accumulators { +private[spark] object Accumulators { // TODO: Use soft references? 
=> need to make readObject work properly then val originals = Map[Long, Accumulable[_, _]]() - val localAccums = Map[Thread, Map[Long, Accumulable[_, _]]]() + val localAccums = new ThreadLocal[Map[Long, Accumulable[_, _]]]() { + override protected def initialValue() = Map[Long, Accumulable[_, _]]() + } var lastId: Long = 0 def newId: Long = synchronized { @@ -261,22 +264,21 @@ private object Accumulators { if (original) { originals(a.id) = a } else { - val accums = localAccums.getOrElseUpdate(Thread.currentThread, Map()) - accums(a.id) = a + localAccums.get()(a.id) = a } } // Clear the local (non-original) accumulators for the current thread def clear() { synchronized { - localAccums.remove(Thread.currentThread) + localAccums.get.clear } } // Get the values of the local accumulators for the current thread (by ID) def values: Map[Long, Any] = synchronized { val ret = Map[Long, Any]() - for ((id, accum) <- localAccums.getOrElse(Thread.currentThread, Map())) { + for ((id, accum) <- localAccums.get) { ret(id) = accum.localValue } return ret diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index 4f49b078bdc66..fbdc7ecebba6e 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -154,7 +154,6 @@ private[spark] class Executor( try { SparkEnv.set(env) - Accumulators.clear() val (taskFiles, taskJars, taskBytes) = Task.deserializeWithDependencies(serializedTask) updateDependencies(taskFiles, taskJars) task = ser.deserialize[Task[Any]](taskBytes, Thread.currentThread.getContextClassLoader) @@ -258,6 +257,8 @@ private[spark] class Executor( env.shuffleMemoryManager.releaseMemoryForThisThread() // Release memory used by this thread for unrolling blocks env.blockManager.memoryStore.releaseUnrollMemoryForThisThread() + // Release memory used by this thread for accumulators + Accumulators.clear() runningTasks.remove(taskId) } } From 273f2c89195fa962748dc1e98bd63bfaefa5160e Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Wed, 10 Dec 2014 12:41:36 -0800 Subject: [PATCH 40/72] [SPARK-4771][Docs] Document standalone cluster supervise mode tdas looks like streaming already refers to the supervise mode. The link from there is broken though. Author: Andrew Or Closes #3627 from andrewor14/document-supervise and squashes the following commits: 9ca0908 [Andrew Or] Wording changes 2b55ed2 [Andrew Or] Document standalone cluster supervise mode --- docs/spark-standalone.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md index 88ab4f9f142e5..580ba6f0b2283 100644 --- a/docs/spark-standalone.md +++ b/docs/spark-standalone.md @@ -244,7 +244,7 @@ To run an interactive Spark shell against the cluster, run the following command You can also pass an option `--total-executor-cores ` to control the number of cores that spark-shell uses on the cluster. -# Launching Compiled Spark Applications +# Launching Spark Applications The [`spark-submit` script](submitting-applications.html) provides the most straightforward way to submit a compiled Spark application to the cluster. For standalone clusters, Spark currently @@ -259,6 +259,15 @@ should specify them through the `--jars` flag using comma as a delimiter (e.g. ` To control the application's configuration or execution environment, see [Spark Configuration](configuration.html). 
+Additionally, standalone `cluster` mode supports restarting your application automatically if it +exited with non-zero exit code. To use this feature, you may pass in the `--supervise` flag to +`spark-submit` when launching your application. Then, if you wish to kill an application that is +failing repeatedly, you may do so through: + + ./bin/spark-class org.apache.spark.deploy.Client kill + +You can find the driver ID through the standalone Master web UI at `http://:8080`. + # Resource Scheduling The standalone cluster mode currently only supports a simple FIFO scheduler across applications. From 396de67fc648a40a53668ba85061489b61f5c2ae Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Wed, 10 Dec 2014 14:27:53 -0800 Subject: [PATCH 41/72] [SPARK-4759] Fix driver hanging from coalescing partitions The driver hangs sometimes when we coalesce RDD partitions. See JIRA for more details and reproduction. This is because our use of empty string as default preferred location in `CoalescedRDDPartition` causes the `TaskSetManager` to schedule the corresponding task on host `""` (empty string). The intended semantics here, however, is that the partition does not have a preferred location, and the TSM should schedule the corresponding task accordingly. Author: Andrew Or Closes #3633 from andrewor14/coalesce-preferred-loc and squashes the following commits: e520d6b [Andrew Or] Oops 3ebf8bd [Andrew Or] A few comments f370a4e [Andrew Or] Fix tests 2f7dfb6 [Andrew Or] Avoid using empty string as default preferred location (cherry picked from commit 4f93d0cabe5d1fc7c0fd0a33d992fd85df1fecb4) Signed-off-by: Andrew Or --- .../org/apache/spark/rdd/CoalescedRDD.scala | 36 +++++++++++-------- .../scala/org/apache/spark/rdd/RDDSuite.scala | 2 +- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala index 9fab1d78abb04..b073eba8a1574 100644 --- a/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala @@ -35,11 +35,10 @@ import org.apache.spark.util.Utils * @param preferredLocation the preferred location for this partition */ private[spark] case class CoalescedRDDPartition( - index: Int, - @transient rdd: RDD[_], - parentsIndices: Array[Int], - @transient preferredLocation: String = "" - ) extends Partition { + index: Int, + @transient rdd: RDD[_], + parentsIndices: Array[Int], + @transient preferredLocation: Option[String] = None) extends Partition { var parents: Seq[Partition] = parentsIndices.map(rdd.partitions(_)) @throws(classOf[IOException]) @@ -55,9 +54,10 @@ private[spark] case class CoalescedRDDPartition( * @return locality of this coalesced partition between 0 and 1 */ def localFraction: Double = { - val loc = parents.count(p => - rdd.context.getPreferredLocs(rdd, p.index).map(tl => tl.host).contains(preferredLocation)) - + val loc = parents.count { p => + val parentPreferredLocations = rdd.context.getPreferredLocs(rdd, p.index).map(_.host) + preferredLocation.exists(parentPreferredLocations.contains) + } if (parents.size == 0) 0.0 else (loc.toDouble / parents.size.toDouble) } } @@ -73,9 +73,9 @@ private[spark] case class CoalescedRDDPartition( * @param balanceSlack used to trade-off balance and locality. 
1.0 is all locality, 0 is all balance */ private[spark] class CoalescedRDD[T: ClassTag]( - @transient var prev: RDD[T], - maxPartitions: Int, - balanceSlack: Double = 0.10) + @transient var prev: RDD[T], + maxPartitions: Int, + balanceSlack: Double = 0.10) extends RDD[T](prev.context, Nil) { // Nil since we implement getDependencies override def getPartitions: Array[Partition] = { @@ -113,7 +113,7 @@ private[spark] class CoalescedRDD[T: ClassTag]( * @return the machine most preferred by split */ override def getPreferredLocations(partition: Partition): Seq[String] = { - List(partition.asInstanceOf[CoalescedRDDPartition].preferredLocation) + partition.asInstanceOf[CoalescedRDDPartition].preferredLocation.toSeq } } @@ -147,7 +147,7 @@ private[spark] class CoalescedRDD[T: ClassTag]( * */ -private[spark] class PartitionCoalescer(maxPartitions: Int, prev: RDD[_], balanceSlack: Double) { +private class PartitionCoalescer(maxPartitions: Int, prev: RDD[_], balanceSlack: Double) { def compare(o1: PartitionGroup, o2: PartitionGroup): Boolean = o1.size < o2.size def compare(o1: Option[PartitionGroup], o2: Option[PartitionGroup]): Boolean = @@ -341,8 +341,14 @@ private[spark] class PartitionCoalescer(maxPartitions: Int, prev: RDD[_], balanc } } -private[spark] case class PartitionGroup(prefLoc: String = "") { +private case class PartitionGroup(prefLoc: Option[String] = None) { var arr = mutable.ArrayBuffer[Partition]() - def size = arr.size } + +private object PartitionGroup { + def apply(prefLoc: String): PartitionGroup = { + require(prefLoc != "", "Preferred location must not be empty") + PartitionGroup(Some(prefLoc)) + } +} diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala index 96b11654a1bd7..6ac4b6af2418f 100644 --- a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala @@ -296,7 +296,7 @@ class RDDSuite extends FunSuite with SharedSparkContext { test("coalesced RDDs with locality") { val data3 = sc.makeRDD(List((1,List("a","c")), (2,List("a","b","c")), (3,List("b")))) val coal3 = data3.coalesce(3) - val list3 = coal3.partitions.map(p => p.asInstanceOf[CoalescedRDDPartition].preferredLocation) + val list3 = coal3.partitions.flatMap(_.asInstanceOf[CoalescedRDDPartition].preferredLocation) assert(list3.sorted === Array("a","b","c"), "Locality preferences are dropped") // RDD with locality preferences spread (non-randomly) over 6 machines, m0 through m5 From 0faea1761b05102be74a84e7225ebcf609b71832 Mon Sep 17 00:00:00 2001 From: Peter Klipfel Date: Sun, 14 Dec 2014 00:01:16 -0800 Subject: [PATCH 42/72] fixed spelling errors in documentation changed "form" to "from" in 3 documentation entries for Kafka integration Author: Peter Klipfel Closes #3691 from peterklipfel/master and squashes the following commits: 0fe7fc5 [Peter Klipfel] fixed spelling errors in documentation (cherry picked from commit 2a2983f7c5edce8e4d1d7592adcf227cbd462ae9) Signed-off-by: Josh Rosen --- .../scala/org/apache/spark/streaming/kafka/KafkaUtils.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala index 48668f763e41e..876753149651f 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala +++ 
b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala @@ -75,7 +75,7 @@ object KafkaUtils { } /** - * Create an input stream that pulls messages form a Kafka Broker. + * Create an input stream that pulls messages from a Kafka Broker. * Storage level of the data will be the default StorageLevel.MEMORY_AND_DISK_SER_2. * @param jssc JavaStreamingContext object * @param zkQuorum Zookeeper quorum (hostname:port,hostname:port,..) @@ -93,7 +93,7 @@ object KafkaUtils { } /** - * Create an input stream that pulls messages form a Kafka Broker. + * Create an input stream that pulls messages from a Kafka Broker. * @param jssc JavaStreamingContext object * @param zkQuorum Zookeeper quorum (hostname:port,hostname:port,..). * @param groupId The group id for this consumer. @@ -114,7 +114,7 @@ object KafkaUtils { } /** - * Create an input stream that pulls messages form a Kafka Broker. + * Create an input stream that pulls messages from a Kafka Broker. * @param jssc JavaStreamingContext object * @param keyTypeClass Key type of RDD * @param valueTypeClass value type of RDD From fa3b3e384531ad1e8a3f6f138110c89a5ba42ec7 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Mon, 15 Dec 2014 16:06:15 -0800 Subject: [PATCH 43/72] SPARK-785 [CORE] ClosureCleaner not invoked on most PairRDDFunctions This looked like perhaps a simple and important one. `combineByKey` looks like it should clean its arguments' closures, and that in turn covers apparently all remaining functions in `PairRDDFunctions` which delegate to it. Author: Sean Owen Closes #3690 from srowen/SPARK-785 and squashes the following commits: 8df68fe [Sean Owen] Clean context of most remaining functions in PairRDDFunctions, which ultimately call combineByKey (cherry picked from commit 2a28bc61009a170af3853c78f7f36970898a6d56) Signed-off-by: Josh Rosen --- .../main/scala/org/apache/spark/rdd/PairRDDFunctions.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index f6d9d12fe9006..869321dac27fc 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -84,7 +84,10 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) throw new SparkException("Default partitioner cannot partition array keys.") } } - val aggregator = new Aggregator[K, V, C](createCombiner, mergeValue, mergeCombiners) + val aggregator = new Aggregator[K, V, C]( + self.context.clean(createCombiner), + self.context.clean(mergeValue), + self.context.clean(mergeCombiners)) if (self.partitioner == Some(partitioner)) { self.mapPartitionsWithContext((context, iter) => { new InterruptibleIterator(context, aggregator.combineValuesByKey(iter, context)) From 892685b37a506d034be630ae9ac58f05ef6a4d5b Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Mon, 15 Dec 2014 17:12:05 -0800 Subject: [PATCH 44/72] SPARK-4814 [CORE] Enable assertions in SBT, Maven tests / AssertionError from Hive's LazyBinaryInteger This enables assertions for the Maven and SBT build, but overrides the Hive module to not enable assertions. 
Author: Sean Owen Closes #3692 from srowen/SPARK-4814 and squashes the following commits: caca704 [Sean Owen] Disable assertions just for Hive f71e783 [Sean Owen] Enable assertions for SBT and Maven build (cherry picked from commit 81112e4b573292e76c7feeed995751bd7a5fe489) Signed-off-by: Josh Rosen Conflicts: pom.xml --- pom.xml | 2 +- project/SparkBuild.scala | 3 +++ sql/hive/pom.xml | 4 ++++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index d185ccccf3686..aa776d85e45ba 100644 --- a/pom.xml +++ b/pom.xml @@ -880,7 +880,7 @@ ${project.build.directory}/surefire-reports . SparkTestSuite.txt - -Xmx3g -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=512m + -ea -Xmx3g -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=512m true diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 60603cd50c8b0..98a961294b7b9 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -229,6 +229,8 @@ object Hive { lazy val settings = Seq( javaOptions += "-XX:MaxPermSize=1g", + // Specially disable assertions since some Hive tests fail them + javaOptions in Test := (javaOptions in Test).value.filterNot(_ == "-ea"), // Multiple queries rely on the TestHive singleton. See comments there for more details. parallelExecution in Test := false, // Supporting all SerDes requires us to depend on deprecated APIs, so we turn off the warnings @@ -341,6 +343,7 @@ object TestSettings { javaOptions in Test += "-Dsun.io.serialization.extendedDebugInfo=true", javaOptions in Test ++= System.getProperties.filter(_._1 startsWith "spark") .map { case (k,v) => s"-D$k=$v" }.toSeq, + javaOptions in Test += "-ea", javaOptions in Test ++= "-Xmx3g -XX:PermSize=128M -XX:MaxNewSize=256m -XX:MaxPermSize=1g" .split(" ").toSeq, javaOptions += "-Xmx3g", diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 241e305cad986..79338318d9ff4 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -143,6 +143,10 @@ org.scalatest scalatest-maven-plugin + + + -da -Xmx3g -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=512m + From 581f866d8f39009f4cb319a9ebb516c451451760 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Tue, 16 Dec 2014 17:55:27 -0800 Subject: [PATCH 45/72] [Release] Major improvements to generate contributors script This commit introduces several major improvements to the script that generates the contributors list for release notes, notably: (1) Use release tags instead of a range of commits. Across branches, commits are not actually strictly two-dimensional, and so it is not sufficient to specify a start hash and an end hash. Otherwise, we end up counting commits that were already merged in an older branch. (2) Match PR numbers in addition to commit hashes. This is related to the first point in that if a PR is already merged in an older minor release tag, it should be filtered out here. This requires us to do some intelligent regex parsing on the commit description in addition to just relying on the GitHub API. (3) Relax author validity check. The old code fails on a name that has many middle names, for instance. The test was just too strict. (4) Use GitHub authentication. This allows us to make far more requests through the GitHub API than before (5000 as opposed to 60 per hour). (5) Translate from Github username, not commit author name. This is important because the commit author name is not always configured correctly by the user. 
For instance, the username "falaki" used to resolve to just "Hossein", which was treated as a github username and translated to something else that is completely arbitrary. (6) Add an option to use the untranslated name. If there is not a satisfactory candidate to replace the untranslated name with, at least allow the user to not translate it. --- dev/create-release/generate-contributors.py | 156 +++++++++++-------- dev/create-release/releaseutils.py | 94 +++++++++-- dev/create-release/translate-contributors.py | 45 ++++-- 3 files changed, 206 insertions(+), 89 deletions(-) diff --git a/dev/create-release/generate-contributors.py b/dev/create-release/generate-contributors.py index a3b78a3eac6d0..e8f81ccbce740 100755 --- a/dev/create-release/generate-contributors.py +++ b/dev/create-release/generate-contributors.py @@ -26,75 +26,103 @@ # You must set the following before use! JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira") -START_COMMIT = os.environ.get("START_COMMIT", "37b100") -END_COMMIT = os.environ.get("END_COMMIT", "3693ae") - -# If commit range is not specified, prompt the user to provide it -if not START_COMMIT or not END_COMMIT: - print "A commit range is required to proceed." - if not START_COMMIT: - START_COMMIT = raw_input("Please specify starting commit hash (inclusive): ") - if not END_COMMIT: - END_COMMIT = raw_input("Please specify ending commit hash (non-inclusive): ") - -# Verify provided arguments -start_commit_line = get_one_line(START_COMMIT) -end_commit_line = get_one_line(END_COMMIT) -num_commits = num_commits_in_range(START_COMMIT, END_COMMIT) -if not start_commit_line: sys.exit("Start commit %s not found!" % START_COMMIT) -if not end_commit_line: sys.exit("End commit %s not found!" % END_COMMIT) -if num_commits == 0: - sys.exit("There are no commits in the provided range [%s, %s)" % (START_COMMIT, END_COMMIT)) +RELEASE_TAG = os.environ.get("START_COMMIT", "v1.2.0-rc2") +PREVIOUS_RELEASE_TAG = os.environ.get("END_COMMIT", "v1.1.0") + +# If the release tags are not provided, prompt the user to provide them +while not tag_exists(RELEASE_TAG): + RELEASE_TAG = raw_input("Please provide a valid release tag: ") +while not tag_exists(PREVIOUS_RELEASE_TAG): + print "Please specify the previous release tag." + PREVIOUS_RELEASE_TAG = raw_input(\ + "For instance, if you are releasing v1.2.0, you should specify v1.1.0: ") + +# Gather commits found in the new tag but not in the old tag. +# This filters commits based on both the git hash and the PR number. +# If either is present in the old tag, then we ignore the commit. +print "Gathering new commits between tags %s and %s" % (PREVIOUS_RELEASE_TAG, RELEASE_TAG) +release_commits = get_commits(RELEASE_TAG) +previous_release_commits = get_commits(PREVIOUS_RELEASE_TAG) +previous_release_hashes = set() +previous_release_prs = set() +for old_commit in previous_release_commits: + previous_release_hashes.add(old_commit.get_hash()) + if old_commit.get_pr_number(): + previous_release_prs.add(old_commit.get_pr_number()) +new_commits = [] +for this_commit in release_commits: + this_hash = this_commit.get_hash() + this_pr_number = this_commit.get_pr_number() + if this_hash in previous_release_hashes: + continue + if this_pr_number and this_pr_number in previous_release_prs: + continue + new_commits.append(this_commit) +if not new_commits: + sys.exit("There are no new commits between %s and %s!" 
% (PREVIOUS_RELEASE_TAG, RELEASE_TAG)) + +# Prompt the user for confirmation that the commit range is correct print "\n==================================================================================" print "JIRA server: %s" % JIRA_API_BASE -print "Start commit (inclusive): %s" % start_commit_line -print "End commit (non-inclusive): %s" % end_commit_line -print "Number of commits in this range: %s" % num_commits +print "Release tag: %s" % RELEASE_TAG +print "Previous release tag: %s" % PREVIOUS_RELEASE_TAG +print "Number of commits in this range: %s" % len(new_commits) print -response = raw_input("Is this correct? [Y/n] ") -if response.lower() != "y" and response: - sys.exit("Ok, exiting") +def print_indented(_list): + for x in _list: print " %s" % x +if yesOrNoPrompt("Show all commits?"): + print_indented(new_commits) print "==================================================================================\n" - -# Find all commits within this range -print "Gathering commits within range [%s..%s)" % (START_COMMIT, END_COMMIT) -commits = get_one_line_commits(START_COMMIT, END_COMMIT) -if not commits: sys.exit("Error: No commits found within this range!") -commits = commits.split("\n") +if not yesOrNoPrompt("Does this look correct?"): + sys.exit("Ok, exiting") # Filter out special commits releases = [] +maintenance = [] reverts = [] nojiras = [] filtered_commits = [] -def is_release(commit): - return re.findall("\[release\]", commit.lower()) or\ - "maven-release-plugin" in commit or "CHANGES.txt" in commit -def has_no_jira(commit): - return not re.findall("SPARK-[0-9]+", commit.upper()) -def is_revert(commit): - return "revert" in commit.lower() -def is_docs(commit): - return re.findall("docs*", commit.lower()) or "programming guide" in commit.lower() -for c in commits: - if not c: continue - elif is_release(c): releases.append(c) - elif is_revert(c): reverts.append(c) - elif is_docs(c): filtered_commits.append(c) # docs may not have JIRA numbers - elif has_no_jira(c): nojiras.append(c) +def is_release(commit_title): + return re.findall("\[release\]", commit_title.lower()) or\ + "preparing spark release" in commit_title.lower() or\ + "preparing development version" in commit_title.lower() or\ + "CHANGES.txt" in commit_title +def is_maintenance(commit_title): + return "maintenance" in commit_title.lower() or\ + "manually close" in commit_title.lower() +def has_no_jira(commit_title): + return not re.findall("SPARK-[0-9]+", commit_title.upper()) +def is_revert(commit_title): + return "revert" in commit_title.lower() +def is_docs(commit_title): + return re.findall("docs*", commit_title.lower()) or\ + "programming guide" in commit_title.lower() +for c in new_commits: + t = c.get_title() + if not t: continue + elif is_release(t): releases.append(c) + elif is_maintenance(t): maintenance.append(c) + elif is_revert(t): reverts.append(c) + elif is_docs(t): filtered_commits.append(c) # docs may not have JIRA numbers + elif has_no_jira(t): nojiras.append(c) else: filtered_commits.append(c) # Warn against ignored commits -def print_indented(_list): - for x in _list: print " %s" % x -if releases or reverts or nojiras: +if releases or maintenance or reverts or nojiras: print "\n==================================================================================" - if releases: print "Releases (%d)" % len(releases); print_indented(releases) - if reverts: print "Reverts (%d)" % len(reverts); print_indented(reverts) - if nojiras: print "No JIRA (%d)" % len(nojiras); print_indented(nojiras) + if releases: 
print "Found %d release commits" % len(releases) + if maintenance: print "Found %d maintenance commits" % len(maintenance) + if reverts: print "Found %d revert commits" % len(reverts) + if nojiras: print "Found %d commits with no JIRA" % len(nojiras) + print "* Warning: these commits will be ignored.\n" + if yesOrNoPrompt("Show ignored commits?"): + if releases: print "Release (%d)" % len(releases); print_indented(releases) + if maintenance: print "Maintenance (%d)" % len(maintenance); print_indented(maintenance) + if reverts: print "Revert (%d)" % len(reverts); print_indented(reverts) + if nojiras: print "No JIRA (%d)" % len(nojiras); print_indented(nojiras) print "==================== Warning: the above commits will be ignored ==================\n" -response = raw_input("%d commits left to process. Ok to proceed? [Y/n] " % len(filtered_commits)) -if response.lower() != "y" and response: +prompt_msg = "%d commits left to process after filtering. Ok to proceed?" % len(filtered_commits) +if not yesOrNoPrompt(prompt_msg): sys.exit("Ok, exiting.") # Keep track of warnings to tell the user at the end @@ -123,10 +151,11 @@ def print_indented(_list): jira_client = JIRA(options = jira_options) print "\n=========================== Compiling contributor list ===========================" for commit in filtered_commits: - commit_hash = re.findall("^[a-z0-9]+", commit)[0] - issues = re.findall("SPARK-[0-9]+", commit.upper()) - author = get_author(commit_hash) - author = unidecode.unidecode(unicode(author, "UTF-8")).strip() # guard against special characters + _hash = commit.get_hash() + title = commit.get_title() + issues = re.findall("SPARK-[0-9]+", title.upper()) + author = commit.get_author() + date = get_date(_hash) # If the author name is invalid, keep track of it along # with all associated issues so we can translate it later if is_valid_author(author): @@ -136,9 +165,8 @@ def print_indented(_list): invalid_authors[author] = set() for issue in issues: invalid_authors[author].add(issue) - date = get_date(commit_hash) - # Parse components from the commit message, if any - commit_components = find_components(commit, commit_hash) + # Parse components from the commit title, if any + commit_components = find_components(title, _hash) # Populate or merge an issue into author_info[author] def populate(issue_type, components): components = components or [CORE_COMPONENT] # assume core if no components provided @@ -153,14 +181,14 @@ def populate(issue_type, components): jira_issue = jira_client.issue(issue) jira_type = jira_issue.fields.issuetype.name jira_type = translate_issue_type(jira_type, issue, warnings) - jira_components = [translate_component(c.name, commit_hash, warnings)\ + jira_components = [translate_component(c.name, _hash, warnings)\ for c in jira_issue.fields.components] all_components = set(jira_components + commit_components) populate(jira_type, all_components) # For docs without an associated JIRA, manually add it ourselves - if is_docs(commit) and not issues: + if is_docs(title) and not issues: populate("documentation", commit_components) - print " Processed commit %s authored by %s on %s" % (commit_hash, author, date) + print " Processed commit %s authored by %s on %s" % (_hash, author, date) print "==================================================================================\n" # Write to contributors file ordered by author names diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py index 76a10c32886d4..18e16bcb90514 100755 --- 
a/dev/create-release/releaseutils.py +++ b/dev/create-release/releaseutils.py @@ -19,6 +19,7 @@ # This file contains helper methods used in creating a release. import re +import sys from subprocess import Popen, PIPE try: @@ -47,20 +48,85 @@ # Contributors list file name contributors_file_name = "contributors.txt" +# Prompt the user to answer yes or no until they do so +def yesOrNoPrompt(msg): + response = raw_input("%s [y/n]: " % msg) + while response != "y" and response != "n": + return yesOrNoPrompt(msg) + return response == "y" + # Utility functions run git commands (written with Git 1.8.5) def run_cmd(cmd): return Popen(cmd, stdout=PIPE).communicate()[0] -def get_author(commit_hash): - return run_cmd(["git", "show", "--quiet", "--pretty=format:%an", commit_hash]) +def run_cmd_error(cmd): return Popen(cmd, stderr=PIPE).communicate()[1] def get_date(commit_hash): return run_cmd(["git", "show", "--quiet", "--pretty=format:%cd", commit_hash]) -def get_one_line(commit_hash): - return run_cmd(["git", "show", "--quiet", "--pretty=format:\"%h %cd %s\"", commit_hash]) -def get_one_line_commits(start_hash, end_hash): - return run_cmd(["git", "log", "--oneline", "%s..%s" % (start_hash, end_hash)]) -def num_commits_in_range(start_hash, end_hash): - output = run_cmd(["git", "log", "--oneline", "%s..%s" % (start_hash, end_hash)]) - lines = [line for line in output.split("\n") if line] # filter out empty lines - return len(lines) +def tag_exists(tag): + stderr = run_cmd_error(["git", "checkout", tag]) + return "error" not in stderr + +# A type-safe representation of a commit +class Commit: + def __init__(self, _hash, author, title, pr_number = None): + self._hash = _hash + self.author = author + self.title = title + self.pr_number = pr_number + def get_hash(self): return self._hash + def get_author(self): return self.author + def get_title(self): return self.title + def get_pr_number(self): return self.pr_number + def __str__(self): + closes_pr = "(Closes #%s)" % self.pr_number if self.pr_number else "" + return "%s %s %s %s" % (self._hash, self.author, self.title, closes_pr) + +# Return all commits that belong to the specified tag. +# +# Under the hood, this runs a `git log` on that tag and parses the fields +# from the command output to construct a list of Commit objects. Note that +# because certain fields reside in the commit description and cannot be parsed +# through the Github API itself, we need to do some intelligent regex parsing +# to extract those fields. +# +# This is written using Git 1.8.5. 
+def get_commits(tag): + commit_start_marker = "|=== COMMIT START MARKER ===|" + commit_end_marker = "|=== COMMIT END MARKER ===|" + field_end_marker = "|=== COMMIT FIELD END MARKER ===|" + log_format =\ + commit_start_marker + "%h" +\ + field_end_marker + "%an" +\ + field_end_marker + "%s" +\ + commit_end_marker + "%b" + output = run_cmd(["git", "log", "--quiet", "--pretty=format:" + log_format, tag]) + commits = [] + raw_commits = [c for c in output.split(commit_start_marker) if c] + for commit in raw_commits: + if commit.count(commit_end_marker) != 1: + print "Commit end marker not found in commit: " + for line in commit.split("\n"): print line + sys.exit(1) + # Separate commit digest from the body + # From the digest we extract the hash, author and the title + # From the body, we extract the PR number and the github username + [commit_digest, commit_body] = commit.split(commit_end_marker) + if commit_digest.count(field_end_marker) != 2: + sys.exit("Unexpected format in commit: %s" % commit_digest) + [_hash, author, title] = commit_digest.split(field_end_marker) + # The PR number and github username is in the commit message + # itself and cannot be accessed through any Github API + pr_number = None + match = re.search("Closes #([0-9]+) from ([^/\\s]+)/", commit_body) + if match: + [pr_number, github_username] = match.groups() + # If the author name is not valid, use the github + # username so we can translate it properly later + if not is_valid_author(author): + author = github_username + # Guard against special characters + author = unidecode.unidecode(unicode(author, "UTF-8")).strip() + commit = Commit(_hash, author, title, pr_number) + commits.append(commit) + return commits # Maintain a mapping for translating issue types to contributions in the release notes # This serves an additional function of warning the user against unknown issue types @@ -70,10 +136,13 @@ def num_commits_in_range(start_hash, end_hash): known_issue_types = { "bug": "bug fixes", "build": "build fixes", + "dependency upgrade": "build fixes", "improvement": "improvements", "new feature": "new features", "documentation": "documentation", - "test": "test" + "test": "test", + "task": "improvement", + "sub-task": "improvement" } # Maintain a mapping for translating component names when creating the release notes @@ -176,8 +245,7 @@ def get_jira_name(author, jira_client): # Return whether the given name is in the form def is_valid_author(author): if not author: return False - author_words = len(author.split(" ")) - return author_words == 2 or author_words == 3 + return " " in author and not re.findall("[0-9]", author) # Capitalize the first letter of each word in the given author name def capitalize_author(author): diff --git a/dev/create-release/translate-contributors.py b/dev/create-release/translate-contributors.py index ef4625b003cb6..462c21142f75f 100755 --- a/dev/create-release/translate-contributors.py +++ b/dev/create-release/translate-contributors.py @@ -37,8 +37,11 @@ JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira") JIRA_USERNAME = os.environ.get("JIRA_USERNAME", None) JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD", None) +GITHUB_API_TOKEN = os.environ.get("GITHUB_API_TOKEN", None) if not JIRA_USERNAME or not JIRA_PASSWORD: sys.exit("Both JIRA_USERNAME and JIRA_PASSWORD must be set") +if not GITHUB_API_TOKEN: + sys.exit("GITHUB_API_TOKEN must be set") # Write new contributors list to .new if not os.path.isfile(contributors_file_name): @@ -62,7 +65,7 @@ # Setup Github and 
JIRA clients jira_options = { "server": JIRA_API_BASE } jira_client = JIRA(options = jira_options, basic_auth = (JIRA_USERNAME, JIRA_PASSWORD)) -github_client = Github() +github_client = Github(GITHUB_API_TOKEN) # Generate candidates for the given author. This should only be called if the given author # name does not represent a full name as this operation is somewhat expensive. Under the @@ -94,7 +97,14 @@ def generate_candidates(author, issues): # Then do the same for the assignee of each of the associated JIRAs # Note that a given issue may not have an assignee, or the assignee may not have a full name for issue in issues: - jira_issue = jira_client.issue(issue) + try: + jira_issue = jira_client.issue(issue) + except JIRAError as e: + # Do not exit just because an issue is not found! + if e.status_code == 404: + warnings.append("Issue %s not found!" % issue) + continue + raise e jira_assignee = jira_issue.fields.assignee if jira_assignee: user_name = jira_assignee.name @@ -123,9 +133,10 @@ def generate_candidates(author, issues): # In non-interactive mode, this script picks the first valid author name from the candidates # If no such name exists, the original name is used (without the JIRA numbers). print "\n========================== Translating contributor list ==========================" -for line in contributors_file: +lines = contributors_file.readlines() +for i, line in enumerate(lines): author = line.split(" - ")[0] - print "Processing author %s" % author + print "Processing author %s (%d/%d)" % (author, i + 1, len(lines)) if not author: print " ERROR: Expected the following format - " print " ERROR: Actual = %s" % line @@ -135,30 +146,39 @@ def generate_candidates(author, issues): candidates = generate_candidates(new_author, issues) # Print out potential replacement candidates along with the sources, e.g. # [X] No full name found for Github user andrewor14 + # [X] No assignee found for SPARK-1763 # [0] Andrew Or - Full name of JIRA user andrewor14 # [1] Andrew Orso - Full name of SPARK-1444 assignee andrewor14 # [2] Andrew Ordall - Full name of SPARK-1663 assignee andrewor14 - # [X] No assignee found for SPARK-1763 - # [3] Custom + # [3] andrewor14 - Raw Github username + # [4] Custom candidate_names = [] + bad_prompts = [] # Prompts that can't actually be selected; print these first. 
+ good_prompts = [] # Prompts that contain valid choices for candidate, source in candidates: if candidate == NOT_FOUND: - print " [X] %s" % source + bad_prompts.append(" [X] %s" % source) else: index = len(candidate_names) candidate_names.append(candidate) - print " [%d] %s - %s" % (index, candidate, source) - custom_index = len(candidate_names) + good_prompts.append(" [%d] %s - %s" % (index, candidate, source)) + raw_index = len(candidate_names) + custom_index = len(candidate_names) + 1 + for p in bad_prompts: print p + if bad_prompts: print " ---" + for p in good_prompts: print p # In interactive mode, additionally provide "custom" option and await user response if INTERACTIVE_MODE: + print " [%d] %s - Raw Github username" % (raw_index, new_author) print " [%d] Custom" % custom_index response = raw_input(" Your choice: ") - while not response.isdigit() or int(response) > custom_index: - response = raw_input(" Please enter an integer between 0 and %d: " % custom_index) + last_index = custom_index + while not response.isdigit() or int(response) > last_index: + response = raw_input(" Please enter an integer between 0 and %d: " % last_index) response = int(response) if response == custom_index: new_author = raw_input(" Please type a custom name for this author: ") - else: + elif response != raw_index: new_author = candidate_names[response] # In non-interactive mode, just pick the first candidate else: @@ -175,6 +195,7 @@ def generate_candidates(author, issues): print " * Replacing %s with %s" % (author, new_author) line = line.replace(author, new_author) new_contributors_file.write(line) + new_contributors_file.flush() print "==================================================================================\n" contributors_file.close() new_contributors_file.close() From 991748d8b736a78e407e1dea3773fba80a8d8d3f Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Tue, 16 Dec 2014 19:28:43 -0800 Subject: [PATCH 46/72] [Release] Cache known author translations locally This bypasses unnecessary calls to the Github and JIRA API. Additionally, having a local cache allows us to remember names that we had to manually discover ourselves. --- dev/create-release/generate-contributors.py | 18 +++--- dev/create-release/known_translations | 59 ++++++++++++++++++ dev/create-release/releaseutils.py | 4 +- dev/create-release/translate-contributors.py | 64 ++++++++++++++------ 4 files changed, 116 insertions(+), 29 deletions(-) create mode 100644 dev/create-release/known_translations diff --git a/dev/create-release/generate-contributors.py b/dev/create-release/generate-contributors.py index e8f81ccbce740..e65c5d8233098 100755 --- a/dev/create-release/generate-contributors.py +++ b/dev/create-release/generate-contributors.py @@ -26,8 +26,8 @@ # You must set the following before use! JIRA_API_BASE = os.environ.get("JIRA_API_BASE", "https://issues.apache.org/jira") -RELEASE_TAG = os.environ.get("START_COMMIT", "v1.2.0-rc2") -PREVIOUS_RELEASE_TAG = os.environ.get("END_COMMIT", "v1.1.0") +RELEASE_TAG = os.environ.get("RELEASE_TAG", "v1.2.0-rc2") +PREVIOUS_RELEASE_TAG = os.environ.get("PREVIOUS_RELEASE_TAG", "v1.1.0") # If the release tags are not provided, prompt the user to provide them while not tag_exists(RELEASE_TAG): @@ -35,7 +35,7 @@ while not tag_exists(PREVIOUS_RELEASE_TAG): print "Please specify the previous release tag." 
PREVIOUS_RELEASE_TAG = raw_input(\ - "For instance, if you are releasing v1.2.0, you should specify v1.1.0: ") + "For instance, if you are releasing v1.2.0, you should specify v1.1.0: ") # Gather commits found in the new tag but not in the old tag. # This filters commits based on both the git hash and the PR number. @@ -84,9 +84,9 @@ def print_indented(_list): filtered_commits = [] def is_release(commit_title): return re.findall("\[release\]", commit_title.lower()) or\ - "preparing spark release" in commit_title.lower() or\ - "preparing development version" in commit_title.lower() or\ - "CHANGES.txt" in commit_title + "preparing spark release" in commit_title.lower() or\ + "preparing development version" in commit_title.lower() or\ + "CHANGES.txt" in commit_title def is_maintenance(commit_title): return "maintenance" in commit_title.lower() or\ "manually close" in commit_title.lower() @@ -96,7 +96,7 @@ def is_revert(commit_title): return "revert" in commit_title.lower() def is_docs(commit_title): return re.findall("docs*", commit_title.lower()) or\ - "programming guide" in commit_title.lower() + "programming guide" in commit_title.lower() for c in new_commits: t = c.get_title() if not t: continue @@ -182,7 +182,7 @@ def populate(issue_type, components): jira_type = jira_issue.fields.issuetype.name jira_type = translate_issue_type(jira_type, issue, warnings) jira_components = [translate_component(c.name, _hash, warnings)\ - for c in jira_issue.fields.components] + for c in jira_issue.fields.components] all_components = set(jira_components + commit_components) populate(jira_type, all_components) # For docs without an associated JIRA, manually add it ourselves @@ -213,7 +213,7 @@ def populate(issue_type, components): # e.g. Bug fixes in MLlib, Core, and Streaming; documentation in YARN else: contributions = ["%s in %s" % (issue_type, nice_join(comps)) \ - for issue_type, comps in author_info[author].items()] + for issue_type, comps in author_info[author].items()] contribution = "; ".join(contributions) # Do not use python's capitalize() on the whole string to preserve case assert contribution diff --git a/dev/create-release/known_translations b/dev/create-release/known_translations new file mode 100644 index 0000000000000..b74e4ee8a330b --- /dev/null +++ b/dev/create-release/known_translations @@ -0,0 +1,59 @@ +# This is a mapping of names to be translated through translate-contributors.py +# The format expected on each line should be: - +CodingCat - Nan Zhu +CrazyJvm - Chao Chen +EugenCepoi - Eugen Cepoi +GraceH - Jie Huang +JerryLead - Lijie Xu +Leolh - Liu Hao +Lewuathe - Kai Sasaki +RongGu - Rong Gu +Shiti - Shiti Saxena +Victsm - Min Shen +WangTaoTheTonic - Wang Tao +XuTingjun - Tingjun Xu +YanTangZhai - Yantang Zhai +alexdebrie - Alex DeBrie +alokito - Alok Saldanha +anantasty - Anant Asthana +andrewor14 - Andrew Or +aniketbhatnagar - Aniket Bhatnagar +arahuja - Arun Ahuja +brkyvz - Burak Yavuz +chesterxgchen - Chester Chen +chiragaggarwal - Chirag Aggarwal +chouqin - Qiping Li +cocoatomo - Tomohiko K. 
+coderfi - Fairiz Azizi +coderxiang - Shuo Xiang +davies - Davies Liu +epahomov - Egor Pahomov +falaki - Hossein Falaki +freeman-lab - Jeremy Freeman +industrial-sloth - Jascha Swisher +jackylk - Jacky Li +jayunit100 - Jay Vyas +jerryshao - Saisai Shao +jkbradley - Joseph Bradley +lianhuiwang - Lianhui Wang +lirui-intel - Rui Li +luluorta - Lu Lu +luogankun - Gankun Luo +maji2014 - Derek Ma +mccheah - Matthew Cheah +mengxr - Xiangrui Meng +nartz - Nathan Artz +odedz - Oded Zimerman +ravipesala - Ravindra Pesala +roxchkplusony - Victor Tso +scwf - Wang Fei +shimingfei - Shiming Fei +surq - Surong Quan +suyanNone - Su Yan +tedyu - Ted Yu +tigerquoll - Dale Richardson +wangxiaojing - Xiaojing Wang +watermen - Yadong Qi +witgo - Guoqiang Li +xinyunh - Xinyun Huang +zsxwing - Shixiong Zhu diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py index 18e16bcb90514..26221b270394e 100755 --- a/dev/create-release/releaseutils.py +++ b/dev/create-release/releaseutils.py @@ -57,11 +57,11 @@ def yesOrNoPrompt(msg): # Utility functions run git commands (written with Git 1.8.5) def run_cmd(cmd): return Popen(cmd, stdout=PIPE).communicate()[0] -def run_cmd_error(cmd): return Popen(cmd, stderr=PIPE).communicate()[1] +def run_cmd_error(cmd): return Popen(cmd, stdout=PIPE, stderr=PIPE).communicate()[1] def get_date(commit_hash): return run_cmd(["git", "show", "--quiet", "--pretty=format:%cd", commit_hash]) def tag_exists(tag): - stderr = run_cmd_error(["git", "checkout", tag]) + stderr = run_cmd_error(["git", "show", tag]) return "error" not in stderr # A type-safe representation of a commit diff --git a/dev/create-release/translate-contributors.py b/dev/create-release/translate-contributors.py index 462c21142f75f..f3b1efdd42785 100755 --- a/dev/create-release/translate-contributors.py +++ b/dev/create-release/translate-contributors.py @@ -67,6 +67,19 @@ jira_client = JIRA(options = jira_options, basic_auth = (JIRA_USERNAME, JIRA_PASSWORD)) github_client = Github(GITHUB_API_TOKEN) +# Load known author translations that are cached locally +known_translations = {} +known_translations_file_name = "known_translations" +known_translations_file = open(known_translations_file_name, "r") +for line in known_translations_file: + if line.startswith("#"): continue + [old_name, new_name] = line.split(" - ") + known_translations[old_name] = new_name +known_translations_file.close() + +# Open again in case the user adds new mappings +known_translations_file = open(known_translations_file_name, "a") + # Generate candidates for the given author. This should only be called if the given author # name does not represent a full name as this operation is somewhat expensive. Under the # hood, it makes several calls to the Github and JIRA API servers to find the candidates. 
@@ -83,17 +96,17 @@ def generate_candidates(author, issues): candidates = [] # First check for full name of Github user - github_name = get_github_name(new_author, github_client) + github_name = get_github_name(author, github_client) if github_name: - candidates.append((github_name, "Full name of Github user %s" % new_author)) + candidates.append((github_name, "Full name of Github user %s" % author)) else: - candidates.append((NOT_FOUND, "No full name found for Github user %s" % new_author)) + candidates.append((NOT_FOUND, "No full name found for Github user %s" % author)) # Then do the same for JIRA user - jira_name = get_jira_name(new_author, jira_client) + jira_name = get_jira_name(author, jira_client) if jira_name: - candidates.append((jira_name, "Full name of JIRA user %s" % new_author)) + candidates.append((jira_name, "Full name of JIRA user %s" % author)) else: - candidates.append((NOT_FOUND, "No full name found for JIRA user %s" % new_author)) + candidates.append((NOT_FOUND, "No full name found for JIRA user %s" % author)) # Then do the same for the assignee of each of the associated JIRAs # Note that a given issue may not have an assignee, or the assignee may not have a full name for issue in issues: @@ -135,15 +148,24 @@ def generate_candidates(author, issues): print "\n========================== Translating contributor list ==========================" lines = contributors_file.readlines() for i, line in enumerate(lines): - author = line.split(" - ")[0] - print "Processing author %s (%d/%d)" % (author, i + 1, len(lines)) - if not author: - print " ERROR: Expected the following format - " - print " ERROR: Actual = %s" % line - if not is_valid_author(author): - new_author = author.split("/")[0] - issues = author.split("/")[1:] - candidates = generate_candidates(new_author, issues) + temp_author = line.split(" - ")[0] + print "Processing author %s (%d/%d)" % (temp_author, i + 1, len(lines)) + if not temp_author: + error_msg = " ERROR: Expected the following format - \n" + error_msg += " ERROR: Actual = %s" % line + print error_msg + warnings.append(error_msg) + new_contributors_file.write(line) + new_contributors_file.flush() + continue + author = temp_author.split("/")[0] + # Use the local copy of known translations where possible + if author in known_translations: + line = line.replace(temp_author, known_translations[author]) + elif not is_valid_author(author): + new_author = author + issues = temp_author.split("/")[1:] + candidates = generate_candidates(author, issues) # Print out potential replacement candidates along with the sources, e.g. 
# [X] No full name found for Github user andrewor14 # [X] No assignee found for SPARK-1763 @@ -169,7 +191,7 @@ def generate_candidates(author, issues): for p in good_prompts: print p # In interactive mode, additionally provide "custom" option and await user response if INTERACTIVE_MODE: - print " [%d] %s - Raw Github username" % (raw_index, new_author) + print " [%d] %s - Raw Github username" % (raw_index, author) print " [%d] Custom" % custom_index response = raw_input(" Your choice: ") last_index = custom_index @@ -191,9 +213,15 @@ def generate_candidates(author, issues): if is_valid_author(new_author): new_author = capitalize_author(new_author) else: - warnings.append("Unable to find a valid name %s for author %s" % (new_author, author)) + warnings.append("Unable to find a valid name %s for author %s" % (author, temp_author)) print " * Replacing %s with %s" % (author, new_author) - line = line.replace(author, new_author) + # If we are in interactive mode, prompt the user whether we want to remember this new mapping + if INTERACTIVE_MODE and\ + author not in known_translations and\ + yesOrNoPrompt(" Add mapping %s -> %s to known translations file?" % (author, new_author)): + known_translations_file.write("%s - %s\n" % (author, new_author)) + known_translations_file.flush() + line = line.replace(temp_author, author) new_contributors_file.write(line) new_contributors_file.flush() print "==================================================================================\n" From 0efd691d983e6a1fa8b3a95ed1c03408febf679d Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Tue, 16 Dec 2014 22:11:03 -0800 Subject: [PATCH 47/72] [Release] Update contributors list format and sort it Additionally, we now warn the user when a duplicate author name arises, in which case he/she needs to resolve it manually. Conflicts: .rat-excludes --- .gitignore | 2 +- .rat-excludes | 2 ++ dev/create-release/generate-contributors.py | 8 ++--- dev/create-release/translate-contributors.py | 34 ++++++++++++++------ 4 files changed, 31 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index ed792506534c3..bb3209e984168 100644 --- a/.gitignore +++ b/.gitignore @@ -47,7 +47,7 @@ checkpoint derby.log dist/ dev/create-release/*txt -dev/create-release/*new +dev/create-release/*final spark-*-bin-*.tgz unit-tests.log /lib/ diff --git a/.rat-excludes b/.rat-excludes index eaefef1b0aa2e..32f2c4b981ac2 100644 --- a/.rat-excludes +++ b/.rat-excludes @@ -57,3 +57,5 @@ dist/* .*iws logs .*scalastyle-output.xml +.*dependency-reduced-pom.xml +dev/create-release/known_translations diff --git a/dev/create-release/generate-contributors.py b/dev/create-release/generate-contributors.py index e65c5d8233098..8aaa250bd7e29 100755 --- a/dev/create-release/generate-contributors.py +++ b/dev/create-release/generate-contributors.py @@ -192,9 +192,9 @@ def populate(issue_type, components): print "==================================================================================\n" # Write to contributors file ordered by author names -# Each line takes the format "Author name - semi-colon delimited contributions" -# e.g. Andrew Or - Bug fixes in Windows, Core, and Web UI; improvements in Core -# e.g. Tathagata Das - Bug fixes and new features in Streaming +# Each line takes the format " * Author name -- semi-colon delimited contributions" +# e.g. * Andrew Or -- Bug fixes in Windows, Core, and Web UI; improvements in Core +# e.g. 
* Tathagata Das -- Bug fixes and new features in Streaming contributors_file = open(contributors_file_name, "w") authors = author_info.keys() authors.sort() @@ -223,7 +223,7 @@ def populate(issue_type, components): # E.g. andrewor14/SPARK-3425/SPARK-1157/SPARK-6672 if author in invalid_authors and invalid_authors[author]: author = author + "/" + "/".join(invalid_authors[author]) - line = "%s - %s" % (author, contribution) + line = " * %s -- %s" % (author, contribution) contributors_file.write(line + "\n") contributors_file.close() print "Contributors list is successfully written to %s!" % contributors_file_name diff --git a/dev/create-release/translate-contributors.py b/dev/create-release/translate-contributors.py index f3b1efdd42785..86fa02d87b9a0 100755 --- a/dev/create-release/translate-contributors.py +++ b/dev/create-release/translate-contributors.py @@ -43,14 +43,12 @@ if not GITHUB_API_TOKEN: sys.exit("GITHUB_API_TOKEN must be set") -# Write new contributors list to .new +# Write new contributors list to .final if not os.path.isfile(contributors_file_name): print "Contributors file %s does not exist!" % contributors_file_name print "Have you run ./generate-contributors.py yet?" sys.exit(1) contributors_file = open(contributors_file_name, "r") -new_contributors_file_name = contributors_file_name + ".new" -new_contributors_file = open(new_contributors_file_name, "w") warnings = [] # In non-interactive mode, this script will choose the first replacement that is valid @@ -73,7 +71,7 @@ known_translations_file = open(known_translations_file_name, "r") for line in known_translations_file: if line.startswith("#"): continue - [old_name, new_name] = line.split(" - ") + [old_name, new_name] = line.strip("\n").split(" - ") known_translations[old_name] = new_name known_translations_file.close() @@ -147,16 +145,16 @@ def generate_candidates(author, issues): # If no such name exists, the original name is used (without the JIRA numbers). print "\n========================== Translating contributor list ==========================" lines = contributors_file.readlines() +contributions = [] for i, line in enumerate(lines): - temp_author = line.split(" - ")[0] + temp_author = line.strip(" * ").split(" -- ")[0] print "Processing author %s (%d/%d)" % (temp_author, i + 1, len(lines)) if not temp_author: - error_msg = " ERROR: Expected the following format - \n" + error_msg = " ERROR: Expected the following format \" * -- \"\n" error_msg += " ERROR: Actual = %s" % line print error_msg warnings.append(error_msg) - new_contributors_file.write(line) - new_contributors_file.flush() + contributions.append(line) continue author = temp_author.split("/")[0] # Use the local copy of known translations where possible @@ -222,10 +220,26 @@ def generate_candidates(author, issues): known_translations_file.write("%s - %s\n" % (author, new_author)) known_translations_file.flush() line = line.replace(temp_author, author) - new_contributors_file.write(line) - new_contributors_file.flush() + contributions.append(line) print "==================================================================================\n" contributors_file.close() +known_translations_file.close() + +# Sort the contributions before writing them to the new file. +# Additionally, check if there are any duplicate author rows. +# This could happen if the same user has both a valid full +# name (e.g. Andrew Or) and an invalid one (andrewor14). +# If so, warn the user about this at the end. 
+contributions.sort() +all_authors = set() +new_contributors_file_name = contributors_file_name + ".final" +new_contributors_file = open(new_contributors_file_name, "w") +for line in contributions: + author = line.strip(" * ").split(" -- ")[0] + if author in all_authors: + warnings.append("Detected duplicate author name %s. Please merge these manually." % author) + all_authors.add(author) + new_contributors_file.write(line) new_contributors_file.close() print "Translated contributors list successfully written to %s!" % new_contributors_file_name From c15e7f211b905cf17bda0d374164b9f70888d337 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Tue, 16 Dec 2014 23:00:25 -0800 Subject: [PATCH 48/72] [HOTFIX] Fix RAT exclusion for known_translations file Author: Josh Rosen Closes #3719 from JoshRosen/rat-fix and squashes the following commits: 1542886 [Josh Rosen] [HOTFIX] Fix RAT exclusion for known_translations file (cherry picked from commit 3d0c37b8118f6057a663f959321a79b8061132b6) Signed-off-by: Josh Rosen --- .rat-excludes | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.rat-excludes b/.rat-excludes index 32f2c4b981ac2..0561fb5223039 100644 --- a/.rat-excludes +++ b/.rat-excludes @@ -58,4 +58,4 @@ dist/* logs .*scalastyle-output.xml .*dependency-reduced-pom.xml -dev/create-release/known_translations +known_translations From f4e6ffc4c359a2b1078fe95f8aca6894811e86ad Mon Sep 17 00:00:00 2001 From: Madhu Siddalingaiah Date: Thu, 18 Dec 2014 16:00:53 -0800 Subject: [PATCH 49/72] [SPARK-4884]: Improve Partition docs Rewording was based on this discussion: http://apache-spark-developers-list.1001551.n3.nabble.com/RDD-data-flow-td9804.html This is the associated JIRA ticket: https://issues.apache.org/jira/browse/SPARK-4884 Author: Madhu Siddalingaiah Closes #3722 from msiddalingaiah/master and squashes the following commits: 79e679f [Madhu Siddalingaiah] [DOC]: improve documentation 51d14b9 [Madhu Siddalingaiah] Merge remote-tracking branch 'upstream/master' 38faca4 [Madhu Siddalingaiah] Merge remote-tracking branch 'upstream/master' cbccbfe [Madhu Siddalingaiah] Documentation: replace with (again) 332f7a2 [Madhu Siddalingaiah] Documentation: replace with cd2b05a [Madhu Siddalingaiah] Merge remote-tracking branch 'upstream/master' 0fc12d7 [Madhu Siddalingaiah] Documentation: add description for repartitionAndSortWithinPartitions (cherry picked from commit d5a596d4188bfa85ff49ee85039f54255c19a4de) Signed-off-by: Josh Rosen --- core/src/main/scala/org/apache/spark/Partition.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/Partition.scala b/core/src/main/scala/org/apache/spark/Partition.scala index 27892dbd2a0bc..dd3f28e4197e3 100644 --- a/core/src/main/scala/org/apache/spark/Partition.scala +++ b/core/src/main/scala/org/apache/spark/Partition.scala @@ -18,11 +18,11 @@ package org.apache.spark /** - * A partition of an RDD. + * An identifier for a partition in an RDD. */ trait Partition extends Serializable { /** - * Get the split's index within its parent RDD + * Get the partition's index within its parent RDD */ def index: Int From 2d6646359131e6046a8845e5b58f793acadb5ee3 Mon Sep 17 00:00:00 2001 From: Sandy Ryza Date: Thu, 18 Dec 2014 22:40:44 -0800 Subject: [PATCH 50/72] SPARK-3428. TaskMetrics for running tasks is missing GC time metrics Author: Sandy Ryza Closes #3684 from sryza/sandy-spark-3428 and squashes the following commits: cb827fe [Sandy Ryza] SPARK-3428. 
TaskMetrics for running tasks is missing GC time metrics (cherry picked from commit 283263ffaa941e7e9ba147cf0ad377d9202d3761) Signed-off-by: Josh Rosen --- .../main/scala/org/apache/spark/executor/Executor.scala | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index fbdc7ecebba6e..3740989edf7d0 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -125,6 +125,8 @@ private[spark] class Executor( threadPool.shutdown() } + private def gcTime = ManagementFactory.getGarbageCollectorMXBeans.map(_.getCollectionTime).sum + class TaskRunner( execBackend: ExecutorBackend, val taskId: Long, taskName: String, serializedTask: ByteBuffer) extends Runnable { @@ -132,6 +134,7 @@ private[spark] class Executor( @volatile private var killed = false @volatile var task: Task[Any] = _ @volatile var attemptedTask: Option[Task[Any]] = None + @volatile var startGCTime: Long = _ def kill(interruptThread: Boolean) { logInfo(s"Executor is trying to kill $taskName (TID $taskId)") @@ -149,8 +152,7 @@ private[spark] class Executor( logInfo(s"Running $taskName (TID $taskId)") execBackend.statusUpdate(taskId, TaskState.RUNNING, EMPTY_BYTE_BUFFER) var taskStart: Long = 0 - def gcTime = ManagementFactory.getGarbageCollectorMXBeans.map(_.getCollectionTime).sum - val startGCTime = gcTime + startGCTime = gcTime try { SparkEnv.set(env) @@ -351,10 +353,13 @@ private[spark] class Executor( while (!isStopped) { val tasksMetrics = new ArrayBuffer[(Long, TaskMetrics)]() + val curGCTime = gcTime + for (taskRunner <- runningTasks.values()) { if (!taskRunner.attemptedTask.isEmpty) { Option(taskRunner.task).flatMap(_.metrics).foreach { metrics => metrics.updateShuffleReadMetrics + metrics.jvmGCTime = curGCTime - taskRunner.startGCTime if (isLocal) { // JobProgressListener will hold an reference of it during // onExecutorMetricsUpdate(), then JobProgressListener can not see From 546a239eb6b5f7952d4c2d5ee5bc400f2321b5b7 Mon Sep 17 00:00:00 2001 From: Ryan Williams Date: Fri, 19 Dec 2014 15:24:41 -0800 Subject: [PATCH 51/72] =?UTF-8?q?[SPARK-4896]=20don=E2=80=99t=20redundantl?= =?UTF-8?q?y=20overwrite=20executor=20JAR=20deps?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Author: Ryan Williams Closes #2848 from ryan-williams/fetch-file and squashes the following commits: c14daff [Ryan Williams] Fix copy that was changed to a move inadvertently 8e39c16 [Ryan Williams] code review feedback 788ed41 [Ryan Williams] don’t redundantly overwrite executor JAR deps (cherry picked from commit 7981f969762e77f1752ef8f86c546d4fc32a1a4f) Signed-off-by: Josh Rosen Conflicts: core/src/main/scala/org/apache/spark/util/Utils.scala --- .../scala/org/apache/spark/util/Utils.scala | 156 ++++++++++++------ 1 file changed, 102 insertions(+), 54 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 20bf36e99e785..c11b2a89cb220 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -369,6 +369,104 @@ private[spark] object Utils extends Logging { uri.getQuery(), uri.getFragment()) } + /** + * Download `in` to `tempFile`, then move it to `destFile`. 
+ * + * If `destFile` already exists: + * - no-op if its contents equal those of `sourceFile`, + * - throw an exception if `fileOverwrite` is false, + * - attempt to overwrite it otherwise. + * + * @param url URL that `sourceFile` originated from, for logging purposes. + * @param in InputStream to download. + * @param tempFile File path to download `in` to. + * @param destFile File path to move `tempFile` to. + * @param fileOverwrite Whether to delete/overwrite an existing `destFile` that does not match + * `sourceFile` + */ + private def downloadFile( + url: String, + in: InputStream, + tempFile: File, + destFile: File, + fileOverwrite: Boolean): Unit = { + + try { + val out = new FileOutputStream(tempFile) + Utils.copyStream(in, out, closeStreams = true) + copyFile(url, tempFile, destFile, fileOverwrite, removeSourceFile = true) + } finally { + // Catch-all for the couple of cases where for some reason we didn't move `tempFile` to + // `destFile`. + if (tempFile.exists()) { + tempFile.delete() + } + } + } + + /** + * Copy `sourceFile` to `destFile`. + * + * If `destFile` already exists: + * - no-op if its contents equal those of `sourceFile`, + * - throw an exception if `fileOverwrite` is false, + * - attempt to overwrite it otherwise. + * + * @param url URL that `sourceFile` originated from, for logging purposes. + * @param sourceFile File path to copy/move from. + * @param destFile File path to copy/move to. + * @param fileOverwrite Whether to delete/overwrite an existing `destFile` that does not match + * `sourceFile` + * @param removeSourceFile Whether to remove `sourceFile` after / as part of moving/copying it to + * `destFile`. + */ + private def copyFile( + url: String, + sourceFile: File, + destFile: File, + fileOverwrite: Boolean, + removeSourceFile: Boolean = false): Unit = { + + if (destFile.exists) { + if (!Files.equal(sourceFile, destFile)) { + if (fileOverwrite) { + logInfo( + s"File $destFile exists and does not match contents of $url, replacing it with $url" + ) + if (!destFile.delete()) { + throw new SparkException( + "Failed to delete %s while attempting to overwrite it with %s".format( + destFile.getAbsolutePath, + sourceFile.getAbsolutePath + ) + ) + } + } else { + throw new SparkException( + s"File $destFile exists and does not match contents of $url") + } + } else { + // Do nothing if the file contents are the same, i.e. this file has been copied + // previously. + logInfo( + "%s has been previously copied to %s".format( + sourceFile.getAbsolutePath, + destFile.getAbsolutePath + ) + ) + return + } + } + + // The file does not exist in the target directory. Copy or move it there. + if (removeSourceFile) { + Files.move(sourceFile, destFile) + } else { + logInfo(s"Copying ${sourceFile.getAbsolutePath} to ${destFile.getAbsolutePath}") + Files.copy(sourceFile, destFile) + } + } + /** * Download a file requested by the executor. Supports fetching the file in a variety of ways, * including HTTP, HDFS and files on a standard filesystem, based on the URL parameter. 
@@ -402,67 +500,17 @@ private[spark] object Utils extends Logging { uc.setReadTimeout(timeout) uc.connect() val in = uc.getInputStream() - val out = new FileOutputStream(tempFile) - Utils.copyStream(in, out, true) - if (targetFile.exists && !Files.equal(tempFile, targetFile)) { - if (fileOverwrite) { - targetFile.delete() - logInfo(("File %s exists and does not match contents of %s, " + - "replacing it with %s").format(targetFile, url, url)) - } else { - tempFile.delete() - throw new SparkException( - "File " + targetFile + " exists and does not match contents of" + " " + url) - } - } - Files.move(tempFile, targetFile) - case "file" | null => + downloadFile(url, in, tempFile, targetFile, fileOverwrite) + case "file" => // In the case of a local file, copy the local file to the target directory. // Note the difference between uri vs url. val sourceFile = if (uri.isAbsolute) new File(uri) else new File(url) - var shouldCopy = true - if (targetFile.exists) { - if (!Files.equal(sourceFile, targetFile)) { - if (fileOverwrite) { - targetFile.delete() - logInfo(("File %s exists and does not match contents of %s, " + - "replacing it with %s").format(targetFile, url, url)) - } else { - throw new SparkException( - "File " + targetFile + " exists and does not match contents of" + " " + url) - } - } else { - // Do nothing if the file contents are the same, i.e. this file has been copied - // previously. - logInfo(sourceFile.getAbsolutePath + " has been previously copied to " - + targetFile.getAbsolutePath) - shouldCopy = false - } - } - - if (shouldCopy) { - // The file does not exist in the target directory. Copy it there. - logInfo("Copying " + sourceFile.getAbsolutePath + " to " + targetFile.getAbsolutePath) - Files.copy(sourceFile, targetFile) - } + copyFile(url, sourceFile, targetFile, fileOverwrite) case _ => // Use the Hadoop filesystem library, which supports file://, hdfs://, s3://, and others val fs = getHadoopFileSystem(uri) val in = fs.open(new Path(uri)) - val out = new FileOutputStream(tempFile) - Utils.copyStream(in, out, true) - if (targetFile.exists && !Files.equal(tempFile, targetFile)) { - if (fileOverwrite) { - targetFile.delete() - logInfo(("File %s exists and does not match contents of %s, " + - "replacing it with %s").format(targetFile, url, url)) - } else { - tempFile.delete() - throw new SparkException( - "File " + targetFile + " exists and does not match contents of" + " " + url) - } - } - Files.move(tempFile, targetFile) + downloadFile(url, in, tempFile, targetFile, fileOverwrite) } // Decompress the file if it's a .tar or .tar.gz if (filename.endsWith(".tar.gz") || filename.endsWith(".tgz")) { From 3597c2ebfe9ca9cf13f7b3be4e6ac54f3a168e50 Mon Sep 17 00:00:00 2001 From: Kanwaljit Singh Date: Fri, 19 Dec 2014 19:25:39 -0800 Subject: [PATCH 52/72] SPARK-2641: Passing num executors to spark arguments from properties file Since we can set spark executor memory and executor cores using property file, we must also be allowed to set the executor instances. 
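To make the intended behavior concrete, the fallback can be exercised in isolation; this is a minimal sketch, with `sparkProperties` as a plain stand-in map for the parsed properties file rather than the real `SparkSubmitArguments` state:

```scala
// A value supplied on the command line wins; otherwise fall back to the
// properties-file entry, mirroring the change in the diff below.
val sparkProperties = Map("spark.executor.instances" -> "4")

var numExecutors: String = null   // would be non-null if --num-executors had been passed
numExecutors = Option(numExecutors)
  .getOrElse(sparkProperties.get("spark.executor.instances").orNull)

println(numExecutors)             // "4", taken from the properties file
```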
Author: Kanwaljit Singh Closes #1657 from kjsingh/branch-1.0 and squashes the following commits: d8a5a12 [Kanwaljit Singh] SPARK-2641: Fixing how spark arguments are loaded from properties file for num executors Conflicts: core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala --- .../scala/org/apache/spark/deploy/SparkSubmitArguments.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala index 5075868321653..b3168356cf4c0 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala @@ -121,6 +121,8 @@ private[spark] class SparkSubmitArguments(args: Seq[String], env: Map[String, St // This supports env vars in older versions of Spark master = Option(master).orElse(env.get("MASTER")).orNull deployMode = Option(deployMode).orElse(env.get("DEPLOY_MODE")).orNull + numExecutors = Option(numExecutors) + .getOrElse(defaultProperties.get("spark.executor.instances").orNull) // Try to set main class from JAR if no --class argument is given if (mainClass == null && !isPython && primaryResource != null) { From e5f27521d4ac837f37180c1dc8e214176d1e88ee Mon Sep 17 00:00:00 2001 From: huangzhaowei Date: Fri, 19 Dec 2014 23:32:56 -0800 Subject: [PATCH 53/72] [Minor] Build Failed: value defaultProperties not found Mvn Build Failed: value defaultProperties not found .Maybe related to this pr: https://github.com/apache/spark/commit/1d648123a77bbcd9b7a34cc0d66c14fa85edfecd andrewor14 can you look at this problem? Author: huangzhaowei Closes #3749 from SaintBacchus/Mvn-Build-Fail and squashes the following commits: 8e2917c [huangzhaowei] Build Failed: value defaultProperties not found (cherry picked from commit a764960b3b6d842eef7fa4777c8fa99d3f60fa1e) Signed-off-by: Josh Rosen --- .../scala/org/apache/spark/deploy/SparkSubmitArguments.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala index b3168356cf4c0..8b0a54e1190ae 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala @@ -122,7 +122,7 @@ private[spark] class SparkSubmitArguments(args: Seq[String], env: Map[String, St master = Option(master).orElse(env.get("MASTER")).orNull deployMode = Option(deployMode).orElse(env.get("DEPLOY_MODE")).orNull numExecutors = Option(numExecutors) - .getOrElse(defaultProperties.get("spark.executor.instances").orNull) + .getOrElse(sparkProperties.get("spark.executor.instances").orNull) // Try to set main class from JAR if no --class argument is given if (mainClass == null && !isPython && primaryResource != null) { From 3bce43f673a505c8c593130ba928c8834e03977b Mon Sep 17 00:00:00 2001 From: zsxwing Date: Mon, 22 Dec 2014 14:26:28 -0800 Subject: [PATCH 54/72] [SPARK-4818][Core] Add 'iterator' to reduce memory consumed by join In Scala, `map` and `flatMap` of `Iterable` will copy the contents of `Iterable` to a new `Seq`. 
Such as, ```Scala val iterable = Seq(1, 2, 3).map(v => { println(v) v }) println("Iterable map done") val iterator = Seq(1, 2, 3).iterator.map(v => { println(v) v }) println("Iterator map done") ``` outputed ``` 1 2 3 Iterable map done Iterator map done ``` So we should use 'iterator' to reduce memory consumed by join. Found by Johannes Simon in http://mail-archives.apache.org/mod_mbox/spark-user/201412.mbox/%3C5BE70814-9D03-4F61-AE2C-0D63F2DE4446%40mail.de%3E Author: zsxwing Closes #3671 from zsxwing/SPARK-4824 and squashes the following commits: 48ee7b9 [zsxwing] Remove the explicit types 95d59d6 [zsxwing] Add 'iterator' to reduce memory consumed by join (cherry picked from commit c233ab3d8d75a33495298964fe73dbf7dd8fe305) Signed-off-by: Josh Rosen Conflicts: core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala --- .../scala/org/apache/spark/rdd/PairRDDFunctions.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index 869321dac27fc..117c05d3b210f 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -472,7 +472,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) */ def join[W](other: RDD[(K, W)], partitioner: Partitioner): RDD[(K, (V, W))] = { this.cogroup(other, partitioner).flatMapValues( pair => - for (v <- pair._1; w <- pair._2) yield (v, w) + for (v <- pair._1.iterator; w <- pair._2.iterator) yield (v, w) ) } @@ -485,9 +485,9 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) def leftOuterJoin[W](other: RDD[(K, W)], partitioner: Partitioner): RDD[(K, (V, Option[W]))] = { this.cogroup(other, partitioner).flatMapValues { pair => if (pair._2.isEmpty) { - pair._1.map(v => (v, None)) + pair._1.iterator.map(v => (v, None)) } else { - for (v <- pair._1; w <- pair._2) yield (v, Some(w)) + for (v <- pair._1.iterator; w <- pair._2.iterator) yield (v, Some(w)) } } } @@ -502,9 +502,9 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) : RDD[(K, (Option[V], W))] = { this.cogroup(other, partitioner).flatMapValues { pair => if (pair._1.isEmpty) { - pair._2.map(w => (None, w)) + pair._2.iterator.map(w => (None, w)) } else { - for (v <- pair._1; w <- pair._2) yield (Some(v), w) + for (v <- pair._1.iterator; w <- pair._2.iterator) yield (Some(v), w) } } } From b1de461a7efd064e76e49d5e613aae25ada6f45d Mon Sep 17 00:00:00 2001 From: Ilayaperumal Gopinathan Date: Tue, 23 Dec 2014 15:14:54 -0800 Subject: [PATCH 55/72] [SPARK-4802] [streaming] Remove receiverInfo once receiver is de-registered Once the streaming receiver is de-registered at executor, the `ReceiverTrackerActor` needs to remove the corresponding reveiverInfo from the `receiverInfo` map at `ReceiverTracker`. 
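The bookkeeping described above amounts to dropping the tracked entry on deregistration and letting only the listener event carry the final state; a minimal sketch with a simplified `ReceiverInfo` stand-in (not the real streaming class) looks like this:

```scala
import scala.collection.mutable

// Simplified model of the tracker's state: one entry per active receiver stream.
case class ReceiverInfo(streamId: Int, lastErrorMessage: String = "")

val receiverInfo = mutable.HashMap(0 -> ReceiverInfo(0))

def deregisterReceiver(streamId: Int, message: String): Unit = {
  val stopped = receiverInfo
    .getOrElse(streamId, ReceiverInfo(streamId))
    .copy(lastErrorMessage = message)
  receiverInfo -= streamId   // remove the entry instead of keeping a stopped placeholder
  // the real code posts StreamingListenerReceiverStopped(stopped) to the listener bus here
  println(s"stopped $stopped; ${receiverInfo.size} receiver(s) still tracked")
}

deregisterReceiver(0, "receiver shut down")
```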
Author: Ilayaperumal Gopinathan Closes #3647 from ilayaperumalg/receiverInfo-RTracker and squashes the following commits: 6eb97d5 [Ilayaperumal Gopinathan] Polishing based on the review 3640c86 [Ilayaperumal Gopinathan] Remove receiverInfo once receiver is de-registered (cherry picked from commit 10d69e9cbfdabe95d0e513176d5347d7b59da0ee) Signed-off-by: Tathagata Das Conflicts: streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala --- .../apache/spark/streaming/scheduler/ReceiverTracker.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala index 5307fe189d717..e748776f1a8c6 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala @@ -139,8 +139,8 @@ class ReceiverTracker(ssc: StreamingContext) extends Logging { logWarning("No prior receiver info") ReceiverInfo(streamId, "", null, false, "", lastErrorMessage = message, lastError = error) } - receiverInfo(streamId) = newReceiverInfo - ssc.scheduler.listenerBus.post(StreamingListenerReceiverStopped(receiverInfo(streamId))) + receiverInfo -= streamId + ssc.scheduler.listenerBus.post(StreamingListenerReceiverStopped(newReceiverInfo)) val messageWithError = if (error != null && !error.isEmpty) { s"$message - $error" } else { From dd0287cca9ca6d8333f5397b36b0017e92a3fbf3 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Tue, 23 Dec 2014 16:02:59 -0800 Subject: [PATCH 56/72] [SPARK-4606] Send EOF to child JVM when there's no more data to read. Author: Marcelo Vanzin Closes #3460 from vanzin/SPARK-4606 and squashes the following commits: 031207d [Marcelo Vanzin] [SPARK-4606] Send EOF to child JVM when there's no more data to read. (cherry picked from commit 7e2deb71c4239564631b19c748e95c3d1aa1c77d) Signed-off-by: Josh Rosen --- .../SparkSubmitDriverBootstrapper.scala | 3 ++- .../scala/org/apache/spark/util/Utils.scala | 24 +++++++++++++------ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitDriverBootstrapper.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitDriverBootstrapper.scala index a64170a47bc1c..40304ed64a1a9 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitDriverBootstrapper.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitDriverBootstrapper.scala @@ -144,7 +144,8 @@ private[spark] object SparkSubmitDriverBootstrapper { val isWindows = Utils.isWindows val isPySparkShell = sys.env.contains("PYSPARK_SHELL") if (!isWindows) { - val stdinThread = new RedirectThread(System.in, process.getOutputStream, "redirect stdin") + val stdinThread = new RedirectThread(System.in, process.getOutputStream, "redirect stdin", + propagateEof = true) stdinThread.start() // For the PySpark shell, Spark submit itself runs as a python subprocess, and so this JVM // should terminate on broken pipe, which signals that the parent process has exited. 
In diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index c11b2a89cb220..4259145ec93d1 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -1604,19 +1604,29 @@ private[spark] object Utils extends Logging { /** * A utility class to redirect the child process's stdout or stderr. */ -private[spark] class RedirectThread(in: InputStream, out: OutputStream, name: String) +private[spark] class RedirectThread( + in: InputStream, + out: OutputStream, + name: String, + propagateEof: Boolean = false) extends Thread(name) { setDaemon(true) override def run() { scala.util.control.Exception.ignoring(classOf[IOException]) { // FIXME: We copy the stream on the level of bytes to avoid encoding problems. - val buf = new Array[Byte](1024) - var len = in.read(buf) - while (len != -1) { - out.write(buf, 0, len) - out.flush() - len = in.read(buf) + try { + val buf = new Array[Byte](1024) + var len = in.read(buf) + while (len != -1) { + out.write(buf, 0, len) + out.flush() + len = in.read(buf) + } + } finally { + if (propagateEof) { + out.close() + } } } } From d21347dbb9799287794f18933eb8abf82221552f Mon Sep 17 00:00:00 2001 From: jerryshao Date: Thu, 25 Dec 2014 19:39:49 -0800 Subject: [PATCH 57/72] [SPARK-4537][Streaming] Expand StreamingSource to add more metrics Add `processingDelay`, `schedulingDelay` and `totalDelay` for the last completed batch. Add `lastReceivedBatchRecords` and `totalReceivedBatchRecords` to the received records counting. Author: jerryshao Closes #3466 from jerryshao/SPARK-4537 and squashes the following commits: 00f5f7f [jerryshao] Change the code style and add totalProcessedRecords 44721a6 [jerryshao] Further address the comments c097ddc [jerryshao] Address the comments 02dd44f [jerryshao] Fix the addressed comments c7a9376 [jerryshao] Expand StreamingSource to add more metrics (cherry picked from commit f205fe477c33a541053c198cd43a5811d6cf9fe2) Signed-off-by: Tathagata Das --- .../spark/streaming/StreamingSource.scala | 53 ++++++++++++++----- .../ui/StreamingJobProgressListener.scala | 19 ++++++- 2 files changed, 57 insertions(+), 15 deletions(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingSource.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingSource.scala index e35a568ddf115..9697437dd2fe5 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingSource.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingSource.scala @@ -29,9 +29,17 @@ private[streaming] class StreamingSource(ssc: StreamingContext) extends Source { private val streamingListener = ssc.progressListener private def registerGauge[T](name: String, f: StreamingJobProgressListener => T, - defaultValue: T) { + defaultValue: T): Unit = { + registerGaugeWithOption[T](name, + (l: StreamingJobProgressListener) => Option(f(streamingListener)), defaultValue) + } + + private def registerGaugeWithOption[T]( + name: String, + f: StreamingJobProgressListener => Option[T], + defaultValue: T): Unit = { metricRegistry.register(MetricRegistry.name("streaming", name), new Gauge[T] { - override def getValue: T = Option(f(streamingListener)).getOrElse(defaultValue) + override def getValue: T = f(streamingListener).getOrElse(defaultValue) }) } @@ -41,6 +49,12 @@ private[streaming] class StreamingSource(ssc: StreamingContext) extends Source { // Gauge for number of total completed batches 
registerGauge("totalCompletedBatches", _.numTotalCompletedBatches, 0L) + // Gauge for number of total received records + registerGauge("totalReceivedRecords", _.numTotalReceivedRecords, 0L) + + // Gauge for number of total processed records + registerGauge("totalProcessedRecords", _.numTotalProcessedRecords, 0L) + // Gauge for number of unprocessed batches registerGauge("unprocessedBatches", _.numUnprocessedBatches, 0L) @@ -55,19 +69,30 @@ private[streaming] class StreamingSource(ssc: StreamingContext) extends Source { // Gauge for last completed batch, useful for monitoring the streaming job's running status, // displayed data -1 for any abnormal condition. - registerGauge("lastCompletedBatch_submissionTime", - _.lastCompletedBatch.map(_.submissionTime).getOrElse(-1L), -1L) - registerGauge("lastCompletedBatch_processStartTime", - _.lastCompletedBatch.flatMap(_.processingStartTime).getOrElse(-1L), -1L) - registerGauge("lastCompletedBatch_processEndTime", - _.lastCompletedBatch.flatMap(_.processingEndTime).getOrElse(-1L), -1L) + registerGaugeWithOption("lastCompletedBatch_submissionTime", + _.lastCompletedBatch.map(_.submissionTime), -1L) + registerGaugeWithOption("lastCompletedBatch_processingStartTime", + _.lastCompletedBatch.flatMap(_.processingStartTime), -1L) + registerGaugeWithOption("lastCompletedBatch_processingEndTime", + _.lastCompletedBatch.flatMap(_.processingEndTime), -1L) + + // Gauge for last completed batch's delay information. + registerGaugeWithOption("lastCompletedBatch_processingDelay", + _.lastCompletedBatch.flatMap(_.processingDelay), -1L) + registerGaugeWithOption("lastCompletedBatch_schedulingDelay", + _.lastCompletedBatch.flatMap(_.schedulingDelay), -1L) + registerGaugeWithOption("lastCompletedBatch_totalDelay", + _.lastCompletedBatch.flatMap(_.totalDelay), -1L) // Gauge for last received batch, useful for monitoring the streaming job's running status, // displayed data -1 for any abnormal condition. - registerGauge("lastReceivedBatch_submissionTime", - _.lastCompletedBatch.map(_.submissionTime).getOrElse(-1L), -1L) - registerGauge("lastReceivedBatch_processStartTime", - _.lastCompletedBatch.flatMap(_.processingStartTime).getOrElse(-1L), -1L) - registerGauge("lastReceivedBatch_processEndTime", - _.lastCompletedBatch.flatMap(_.processingEndTime).getOrElse(-1L), -1L) + registerGaugeWithOption("lastReceivedBatch_submissionTime", + _.lastCompletedBatch.map(_.submissionTime), -1L) + registerGaugeWithOption("lastReceivedBatch_processingStartTime", + _.lastCompletedBatch.flatMap(_.processingStartTime), -1L) + registerGaugeWithOption("lastReceivedBatch_processingEndTime", + _.lastCompletedBatch.flatMap(_.processingEndTime), -1L) + + // Gauge for last received batch records. 
+ registerGauge("lastReceivedBatch_records", _.lastReceivedBatchRecords.values.sum, 0L) } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala index f61069b56db5e..5ee53a5c5f561 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala @@ -25,7 +25,6 @@ import org.apache.spark.streaming.scheduler.StreamingListenerBatchStarted import org.apache.spark.streaming.scheduler.BatchInfo import org.apache.spark.streaming.scheduler.StreamingListenerBatchSubmitted import org.apache.spark.util.Distribution -import org.apache.spark.Logging private[streaming] class StreamingJobProgressListener(ssc: StreamingContext) @@ -36,6 +35,8 @@ private[streaming] class StreamingJobProgressListener(ssc: StreamingContext) private val completedaBatchInfos = new Queue[BatchInfo] private val batchInfoLimit = ssc.conf.getInt("spark.streaming.ui.retainedBatches", 100) private var totalCompletedBatches = 0L + private var totalReceivedRecords = 0L + private var totalProcessedRecords = 0L private val receiverInfos = new HashMap[Int, ReceiverInfo] val batchDuration = ssc.graph.batchDuration.milliseconds @@ -65,6 +66,10 @@ private[streaming] class StreamingJobProgressListener(ssc: StreamingContext) override def onBatchStarted(batchStarted: StreamingListenerBatchStarted) = synchronized { runningBatchInfos(batchStarted.batchInfo.batchTime) = batchStarted.batchInfo waitingBatchInfos.remove(batchStarted.batchInfo.batchTime) + + batchStarted.batchInfo.receivedBlockInfo.foreach { case (_, infos) => + totalReceivedRecords += infos.map(_.numRecords).sum + } } override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted) = synchronized { @@ -73,6 +78,10 @@ private[streaming] class StreamingJobProgressListener(ssc: StreamingContext) completedaBatchInfos.enqueue(batchCompleted.batchInfo) if (completedaBatchInfos.size > batchInfoLimit) completedaBatchInfos.dequeue() totalCompletedBatches += 1L + + batchCompleted.batchInfo.receivedBlockInfo.foreach { case (_, infos) => + totalProcessedRecords += infos.map(_.numRecords).sum + } } def numReceivers = synchronized { @@ -83,6 +92,14 @@ private[streaming] class StreamingJobProgressListener(ssc: StreamingContext) totalCompletedBatches } + def numTotalReceivedRecords: Long = synchronized { + totalReceivedRecords + } + + def numTotalProcessedRecords: Long = synchronized { + totalProcessedRecords + } + def numUnprocessedBatches: Long = synchronized { waitingBatchInfos.size + runningBatchInfos.size } From 3442b7bb69c662c69ad2ef105abb8d8d8db34599 Mon Sep 17 00:00:00 2001 From: GuoQiang Li Date: Fri, 26 Dec 2014 23:31:29 -0800 Subject: [PATCH 58/72] [SPARK-4952][Core]Handle ConcurrentModificationExceptions in SparkEnv.environmentDetails Author: GuoQiang Li Closes #3788 from witgo/SPARK-4952 and squashes the following commits: d903529 [GuoQiang Li] Handle ConcurrentModificationExceptions in SparkEnv.environmentDetails (cherry picked from commit 080ceb771a1e6b9f844cfd4f1baa01133c106888) Signed-off-by: Patrick Wendell --- core/src/main/scala/org/apache/spark/SparkEnv.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/SparkEnv.scala b/core/src/main/scala/org/apache/spark/SparkEnv.scala index 72716567ca99b..19ba6e989d44b 100644 --- 
a/core/src/main/scala/org/apache/spark/SparkEnv.scala +++ b/core/src/main/scala/org/apache/spark/SparkEnv.scala @@ -318,7 +318,7 @@ object SparkEnv extends Logging { val sparkProperties = (conf.getAll ++ schedulerMode).sorted // System properties that are not java classpaths - val systemProperties = System.getProperties.iterator.toSeq + val systemProperties = Utils.getSystemProperties.toSeq val otherProperties = systemProperties.filter { case (k, _) => k != "java.class.path" && !k.startsWith("spark.") }.sorted From d5e0a45edb00038f7f2aba3a34bcee1117bd98d9 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Mon, 29 Dec 2014 22:03:07 -0800 Subject: [PATCH 59/72] [HOTFIX] Add SPARK_VERSION to Spark package object. This helps to avoid build breaks when backporting patches that use org.apache.spark.SPARK_VERSION. --- core/src/main/scala/org/apache/spark/package.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/core/src/main/scala/org/apache/spark/package.scala b/core/src/main/scala/org/apache/spark/package.scala index 5cdbc306e56a0..8caff7eb05762 100644 --- a/core/src/main/scala/org/apache/spark/package.scala +++ b/core/src/main/scala/org/apache/spark/package.scala @@ -44,4 +44,5 @@ package org.apache package object spark { // For package docs only + val SPARK_VERSION = "1.1.2-SNAPSHOT" } From 822a0b42f79acea2771d0b298e803c11c37aab81 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Tue, 30 Dec 2014 09:29:52 -0800 Subject: [PATCH 60/72] [SPARK-4882] Register PythonBroadcast with Kryo so that PySpark works with KryoSerializer This PR fixes an issue where PySpark broadcast variables caused NullPointerExceptions if KryoSerializer was used. The fix is to register PythonBroadcast with Kryo so that it's deserialized with a KryoJavaSerializer. Author: Josh Rosen Closes #3831 from JoshRosen/SPARK-4882 and squashes the following commits: 0466c7a [Josh Rosen] Register PythonBroadcast with Kryo. d5b409f [Josh Rosen] Enable registrationRequired, which would have caught this bug. 
069d8a7 [Josh Rosen] Add failing test for SPARK-4882 (cherry picked from commit efa80a531ecd485f6cf0cdc24ffa42ba17eea46d) Signed-off-by: Josh Rosen --- .../spark/serializer/KryoSerializer.scala | 2 + .../api/python/PythonBroadcastSuite.scala | 60 +++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 core/src/test/scala/org/apache/spark/api/python/PythonBroadcastSuite.scala diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala index 87ef9bb0b43c6..0c8bd62fd34bd 100644 --- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala @@ -26,6 +26,7 @@ import com.esotericsoftware.kryo.serializers.{JavaSerializer => KryoJavaSerializ import com.twitter.chill.{AllScalaRegistrar, EmptyScalaKryoInstantiator} import org.apache.spark._ +import org.apache.spark.api.python.PythonBroadcast import org.apache.spark.broadcast.HttpBroadcast import org.apache.spark.scheduler.MapStatus import org.apache.spark.storage._ @@ -79,6 +80,7 @@ class KryoSerializer(conf: SparkConf) // Allow sending SerializableWritable kryo.register(classOf[SerializableWritable[_]], new KryoJavaSerializer()) kryo.register(classOf[HttpBroadcast[_]], new KryoJavaSerializer()) + kryo.register(classOf[PythonBroadcast], new KryoJavaSerializer()) // Allow the user to register their own classes by setting spark.kryo.registrator for (regCls <- registrator) { diff --git a/core/src/test/scala/org/apache/spark/api/python/PythonBroadcastSuite.scala b/core/src/test/scala/org/apache/spark/api/python/PythonBroadcastSuite.scala new file mode 100644 index 0000000000000..8959a843dbd7d --- /dev/null +++ b/core/src/test/scala/org/apache/spark/api/python/PythonBroadcastSuite.scala @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.api.python + +import scala.io.Source + +import java.io.{PrintWriter, File} + +import org.scalatest.{Matchers, FunSuite} + +import org.apache.spark.{SharedSparkContext, SparkConf} +import org.apache.spark.serializer.KryoSerializer +import org.apache.spark.util.Utils + +// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize +// a PythonBroadcast: +class PythonBroadcastSuite extends FunSuite with Matchers with SharedSparkContext { + test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") { + val tempDir = Utils.createTempDir() + val broadcastedString = "Hello, world!" 
+ def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = { + val source = Source.fromFile(broadcast.path) + val contents = source.mkString + source.close() + contents should be (broadcastedString) + } + try { + val broadcastDataFile: File = { + val file = new File(tempDir, "broadcastData") + val printWriter = new PrintWriter(file) + printWriter.write(broadcastedString) + printWriter.close() + file + } + val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath) + assertBroadcastIsValid(broadcast) + val conf = new SparkConf().set("spark.kryo.registrationRequired", "true") + val deserializedBroadcast = + Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance()) + assertBroadcastIsValid(deserializedBroadcast) + } finally { + Utils.deleteRecursively(tempDir) + } + } +} From d6b8d2c0307b5cc2bacd0c07139003821055ca69 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Tue, 30 Dec 2014 09:33:01 -0800 Subject: [PATCH 61/72] Revert "[SPARK-4882] Register PythonBroadcast with Kryo so that PySpark works with KryoSerializer" This reverts commit 822a0b42f79acea2771d0b298e803c11c37aab81. This fix does not apply to branch-1.1 or branch-1.0, since PythonBroadcast is new in 1.2. --- .../spark/serializer/KryoSerializer.scala | 2 - .../api/python/PythonBroadcastSuite.scala | 60 ------------------- 2 files changed, 62 deletions(-) delete mode 100644 core/src/test/scala/org/apache/spark/api/python/PythonBroadcastSuite.scala diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala index 0c8bd62fd34bd..87ef9bb0b43c6 100644 --- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala @@ -26,7 +26,6 @@ import com.esotericsoftware.kryo.serializers.{JavaSerializer => KryoJavaSerializ import com.twitter.chill.{AllScalaRegistrar, EmptyScalaKryoInstantiator} import org.apache.spark._ -import org.apache.spark.api.python.PythonBroadcast import org.apache.spark.broadcast.HttpBroadcast import org.apache.spark.scheduler.MapStatus import org.apache.spark.storage._ @@ -80,7 +79,6 @@ class KryoSerializer(conf: SparkConf) // Allow sending SerializableWritable kryo.register(classOf[SerializableWritable[_]], new KryoJavaSerializer()) kryo.register(classOf[HttpBroadcast[_]], new KryoJavaSerializer()) - kryo.register(classOf[PythonBroadcast], new KryoJavaSerializer()) // Allow the user to register their own classes by setting spark.kryo.registrator for (regCls <- registrator) { diff --git a/core/src/test/scala/org/apache/spark/api/python/PythonBroadcastSuite.scala b/core/src/test/scala/org/apache/spark/api/python/PythonBroadcastSuite.scala deleted file mode 100644 index 8959a843dbd7d..0000000000000 --- a/core/src/test/scala/org/apache/spark/api/python/PythonBroadcastSuite.scala +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.api.python - -import scala.io.Source - -import java.io.{PrintWriter, File} - -import org.scalatest.{Matchers, FunSuite} - -import org.apache.spark.{SharedSparkContext, SparkConf} -import org.apache.spark.serializer.KryoSerializer -import org.apache.spark.util.Utils - -// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize -// a PythonBroadcast: -class PythonBroadcastSuite extends FunSuite with Matchers with SharedSparkContext { - test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") { - val tempDir = Utils.createTempDir() - val broadcastedString = "Hello, world!" - def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = { - val source = Source.fromFile(broadcast.path) - val contents = source.mkString - source.close() - contents should be (broadcastedString) - } - try { - val broadcastDataFile: File = { - val file = new File(tempDir, "broadcastData") - val printWriter = new PrintWriter(file) - printWriter.write(broadcastedString) - printWriter.close() - file - } - val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath) - assertBroadcastIsValid(broadcast) - val conf = new SparkConf().set("spark.kryo.registrationRequired", "true") - val deserializedBroadcast = - Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance()) - assertBroadcastIsValid(deserializedBroadcast) - } finally { - Utils.deleteRecursively(tempDir) - } - } -} From eac740e9a11d852d4a192d8c8f63862ed29c43fd Mon Sep 17 00:00:00 2001 From: zsxwing Date: Tue, 30 Dec 2014 14:39:13 -0800 Subject: [PATCH 62/72] [SPARK-4813][Streaming] Fix the issue that ContextWaiter didn't handle 'spurious wakeup' Used `Condition` to rewrite `ContextWaiter` because it provides a convenient API `awaitNanos` for timeout. 
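For illustration only (this code is not part of the patch): a minimal standalone sketch of the spurious-wakeup-safe pattern the rewrite below adopts, with hypothetical class and method names. A bare wait(timeout) may return with nothing having changed, so the predicate must be re-checked in a loop and the remaining wait time recomputed, which is exactly what Condition.awaitNanos makes convenient by returning the nanoseconds still left:

    import java.util.concurrent.TimeUnit
    import java.util.concurrent.locks.ReentrantLock

    class StopLatch {                        // hypothetical name, for illustration only
      private val lock = new ReentrantLock()
      private val condition = lock.newCondition()
      private var stopped = false            // guarded by "lock"

      def stop(): Unit = {
        lock.lock()
        try {
          stopped = true
          condition.signalAll()
        } finally {
          lock.unlock()
        }
      }

      /** Returns true if stopped, false if the timeout elapsed first. */
      def awaitStop(timeoutMs: Long): Boolean = {
        lock.lock()
        try {
          var nanos = TimeUnit.MILLISECONDS.toNanos(timeoutMs)
          while (!stopped && nanos > 0) {    // a wakeup alone proves nothing; re-check and re-wait
            nanos = condition.awaitNanos(nanos)
          }
          stopped
        } finally {
          lock.unlock()
        }
      }
    }
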
Author: zsxwing Closes #3661 from zsxwing/SPARK-4813 and squashes the following commits: 52247f5 [zsxwing] Add explicit unit type be42bcf [zsxwing] Update as per review suggestion e06bd4f [zsxwing] Fix the issue that ContextWaiter didn't handle 'spurious wakeup' (cherry picked from commit 6a897829444e2ef273586511f93a40d36e64fb0b) Signed-off-by: Tathagata Das --- .../spark/streaming/ContextWaiter.scala | 63 ++++++++++++++----- 1 file changed, 48 insertions(+), 15 deletions(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ContextWaiter.scala b/streaming/src/main/scala/org/apache/spark/streaming/ContextWaiter.scala index a0aeacbc733bd..fdbbe2aa6ef08 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ContextWaiter.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ContextWaiter.scala @@ -17,30 +17,63 @@ package org.apache.spark.streaming +import java.util.concurrent.TimeUnit +import java.util.concurrent.locks.ReentrantLock + private[streaming] class ContextWaiter { + + private val lock = new ReentrantLock() + private val condition = lock.newCondition() + + // Guarded by "lock" private var error: Throwable = null - private var stopped: Boolean = false - def notifyError(e: Throwable) = synchronized { - error = e - notifyAll() - } + // Guarded by "lock" + private var stopped: Boolean = false - def notifyStop() = synchronized { - stopped = true - notifyAll() + def notifyError(e: Throwable): Unit = { + lock.lock() + try { + error = e + condition.signalAll() + } finally { + lock.unlock() + } } - def waitForStopOrError(timeout: Long = -1) = synchronized { - // If already had error, then throw it - if (error != null) { - throw error + def notifyStop(): Unit = { + lock.lock() + try { + stopped = true + condition.signalAll() + } finally { + lock.unlock() } + } - // If not already stopped, then wait - if (!stopped) { - if (timeout < 0) wait() else wait(timeout) + /** + * Return `true` if it's stopped; or throw the reported error if `notifyError` has been called; or + * `false` if the waiting time detectably elapsed before return from the method. + */ + def waitForStopOrError(timeout: Long = -1): Boolean = { + lock.lock() + try { + if (timeout < 0) { + while (!stopped && error == null) { + condition.await() + } + } else { + var nanos = TimeUnit.MILLISECONDS.toNanos(timeout) + while (!stopped && error == null && nanos > 0) { + nanos = condition.awaitNanos(nanos) + } + } + // If already had error, then throw it if (error != null) throw error + // already stopped or timeout + stopped + } finally { + lock.unlock() } } } From babcafaa8695a0599939c5844a0570489eb5f27d Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Tue, 30 Dec 2014 18:12:20 -0800 Subject: [PATCH 63/72] [SPARK-1010] Clean up uses of System.setProperty in unit tests Several of our tests call System.setProperty (or test code which implicitly sets system properties) and don't always reset/clear the modified properties, which can create ordering dependencies between tests and cause hard-to-diagnose failures. This patch removes most uses of System.setProperty from our tests, since in most cases we can use SparkConf to set these configurations (there are a few exceptions, including the tests of SparkConf itself). For the cases where we continue to use System.setProperty, this patch introduces a `ResetSystemProperties` ScalaTest mixin class which snapshots the system properties before individual tests and to automatically restores them on test completion / failure. 
See the block comment at the top of the ResetSystemProperties class for more details. Author: Josh Rosen Closes #3739 from JoshRosen/cleanup-system-properties-in-tests and squashes the following commits: 0236d66 [Josh Rosen] Replace setProperty uses in two example programs / tools 3888fe3 [Josh Rosen] Remove setProperty use in LocalJavaStreamingContext 4f4031d [Josh Rosen] Add note on why SparkSubmitSuite needs ResetSystemProperties 4742a5b [Josh Rosen] Clarify ResetSystemProperties trait inheritance ordering. 0eaf0b6 [Josh Rosen] Remove setProperty call in TaskResultGetterSuite. 7a3d224 [Josh Rosen] Fix trait ordering 3fdb554 [Josh Rosen] Remove setProperty call in TaskSchedulerImplSuite bee20df [Josh Rosen] Remove setProperty calls in SparkContextSchedulerCreationSuite 655587c [Josh Rosen] Remove setProperty calls in JobCancellationSuite 3f2f955 [Josh Rosen] Remove System.setProperty calls in DistributedSuite cfe9cce [Josh Rosen] Remove use of system properties in SparkContextSuite 8783ab0 [Josh Rosen] Remove TestUtils.setSystemProperty, since it is subsumed by the ResetSystemProperties trait. 633a84a [Josh Rosen] Remove use of system properties in FileServerSuite 25bfce2 [Josh Rosen] Use ResetSystemProperties in UtilsSuite 1d1aa5a [Josh Rosen] Use ResetSystemProperties in SizeEstimatorSuite dd9492b [Josh Rosen] Use ResetSystemProperties in AkkaUtilsSuite b0daff2 [Josh Rosen] Use ResetSystemProperties in BlockManagerSuite e9ded62 [Josh Rosen] Use ResetSystemProperties in TaskSchedulerImplSuite 5b3cb54 [Josh Rosen] Use ResetSystemProperties in SparkListenerSuite 0995c4b [Josh Rosen] Use ResetSystemProperties in SparkContextSchedulerCreationSuite c83ded8 [Josh Rosen] Use ResetSystemProperties in SparkConfSuite 51aa870 [Josh Rosen] Use withSystemProperty in ShuffleSuite 60a63a1 [Josh Rosen] Use ResetSystemProperties in JobCancellationSuite 14a92e4 [Josh Rosen] Use withSystemProperty in FileServerSuite 628f46c [Josh Rosen] Use ResetSystemProperties in DistributedSuite 9e3e0dd [Josh Rosen] Add ResetSystemProperties test fixture mixin; use it in SparkSubmitSuite. 4dcea38 [Josh Rosen] Move withSystemProperty to TestUtils class. 
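For illustration only (not part of the patch): a minimal sketch of how a suite is meant to use the new mixin; the suite name and property key here are hypothetical. Because ResetSystemProperties is mixed in last, its beforeEach() snapshot runs before the test body and its afterEach() restores the original properties even when the test fails:

    package org.apache.spark.util        // the trait is private[spark], so the example suite lives in the package

    import org.scalatest.FunSuite

    class ExamplePropertiesSuite extends FunSuite with ResetSystemProperties {
      test("a modified system property does not leak into other tests") {
        System.setProperty("spark.test.exampleProperty", "42")
        assert(System.getProperty("spark.test.exampleProperty") === "42")
        // No System.clearProperty needed here: the mixin restores the snapshot afterwards.
      }
    }
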
(cherry picked from commit 352ed6bbe3c3b67e52e298e7c535ae414d96beca) Signed-off-by: Josh Rosen Conflicts: core/src/test/scala/org/apache/spark/ShuffleSuite.scala core/src/test/scala/org/apache/spark/SparkConfSuite.scala core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala core/src/test/scala/org/apache/spark/SparkContextSuite.scala core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala core/src/test/scala/org/apache/spark/util/UtilsSuite.scala external/flume/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java external/mqtt/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java external/twitter/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java external/zeromq/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java tools/src/main/scala/org/apache/spark/tools/StoragePerfTester.scala --- .../org/apache/spark/DistributedSuite.scala | 21 ++----- .../org/apache/spark/FileServerSuite.scala | 16 +++--- .../apache/spark/JobCancellationSuite.scala | 21 ++++--- .../scala/org/apache/spark/ShuffleSuite.scala | 22 +++---- .../org/apache/spark/SparkConfSuite.scala | 52 +++++++---------- .../SparkContextSchedulerCreationSuite.scala | 31 +++++----- .../spark/deploy/SparkSubmitSuite.scala | 6 +- .../spark/scheduler/SparkListenerSuite.scala | 9 +-- .../scheduler/TaskResultGetterSuite.scala | 23 +++----- .../scheduler/TaskSchedulerImplSuite.scala | 6 +- .../spark/storage/BlockManagerSuite.scala | 24 +++----- .../apache/spark/util/AkkaUtilsSuite.scala | 2 +- .../spark/util/ResetSystemProperties.scala | 57 +++++++++++++++++++ .../spark/util/SizeEstimatorSuite.scala | 38 +++---------- .../org/apache/spark/util/UtilsSuite.scala | 2 +- .../apache/spark/examples/BroadcastTest.scala | 6 +- .../streaming/LocalJavaStreamingContext.java | 8 ++- .../spark/tools/StoragePerfTester.scala | 9 +-- 18 files changed, 170 insertions(+), 183 deletions(-) create mode 100644 core/src/test/scala/org/apache/spark/util/ResetSystemProperties.scala diff --git a/core/src/test/scala/org/apache/spark/DistributedSuite.scala b/core/src/test/scala/org/apache/spark/DistributedSuite.scala index 41c294f727b3c..a36eef2dabcf5 100644 --- a/core/src/test/scala/org/apache/spark/DistributedSuite.scala +++ b/core/src/test/scala/org/apache/spark/DistributedSuite.scala @@ -17,7 +17,6 @@ package org.apache.spark -import org.scalatest.BeforeAndAfter import org.scalatest.FunSuite import org.scalatest.concurrent.Timeouts._ import org.scalatest.Matchers @@ -31,16 +30,10 @@ class NotSerializableClass class NotSerializableExn(val notSer: NotSerializableClass) extends Throwable() {} -class DistributedSuite extends FunSuite with Matchers with BeforeAndAfter - with LocalSparkContext { +class DistributedSuite extends FunSuite with Matchers with LocalSparkContext { val clusterUrl = "local-cluster[2,1,512]" - after { - System.clearProperty("spark.reducer.maxMbInFlight") - System.clearProperty("spark.storage.memoryFraction") - } - test("task throws not serializable exception") { // Ensures that executors do not crash when an exn is not serializable. If executors crash, // this test will hang. 
Correct behavior is that executors don't crash but fail tasks @@ -86,15 +79,14 @@ class DistributedSuite extends FunSuite with Matchers with BeforeAndAfter } test("groupByKey where map output sizes exceed maxMbInFlight") { - System.setProperty("spark.reducer.maxMbInFlight", "1") - sc = new SparkContext(clusterUrl, "test") + val conf = new SparkConf().set("spark.reducer.maxMbInFlight", "1") + sc = new SparkContext(clusterUrl, "test", conf) // This data should be around 20 MB, so even with 4 mappers and 2 reducers, each map output // file should be about 2.5 MB val pairs = sc.parallelize(1 to 2000, 4).map(x => (x % 16, new Array[Byte](10000))) val groups = pairs.groupByKey(2).map(x => (x._1, x._2.size)).collect() assert(groups.length === 16) assert(groups.map(_._2).sum === 2000) - // Note that spark.reducer.maxMbInFlight will be cleared in the test suite's after{} block } test("accumulators") { @@ -211,7 +203,6 @@ class DistributedSuite extends FunSuite with Matchers with BeforeAndAfter } test("compute without caching when no partitions fit in memory") { - System.setProperty("spark.storage.memoryFraction", "0.0001") sc = new SparkContext(clusterUrl, "test") // data will be 4 million * 4 bytes = 16 MB in size, but our memoryFraction set the cache // to only 50 KB (0.0001 of 512 MB), so no partitions should fit in memory @@ -219,12 +210,11 @@ class DistributedSuite extends FunSuite with Matchers with BeforeAndAfter assert(data.count() === 4000000) assert(data.count() === 4000000) assert(data.count() === 4000000) - System.clearProperty("spark.storage.memoryFraction") } test("compute when only some partitions fit in memory") { - System.setProperty("spark.storage.memoryFraction", "0.01") - sc = new SparkContext(clusterUrl, "test") + val conf = new SparkConf().set("spark.storage.memoryFraction", "0.01") + sc = new SparkContext(clusterUrl, "test", conf) // data will be 4 million * 4 bytes = 16 MB in size, but our memoryFraction set the cache // to only 5 MB (0.01 of 512 MB), so not all of it will fit in memory; we use 20 partitions // to make sure that *some* of them do fit though @@ -232,7 +222,6 @@ class DistributedSuite extends FunSuite with Matchers with BeforeAndAfter assert(data.count() === 4000000) assert(data.count() === 4000000) assert(data.count() === 4000000) - System.clearProperty("spark.storage.memoryFraction") } test("passing environment variables to cluster") { diff --git a/core/src/test/scala/org/apache/spark/FileServerSuite.scala b/core/src/test/scala/org/apache/spark/FileServerSuite.scala index 5997e014c5ca8..9ffe7df364d39 100644 --- a/core/src/test/scala/org/apache/spark/FileServerSuite.scala +++ b/core/src/test/scala/org/apache/spark/FileServerSuite.scala @@ -33,10 +33,11 @@ class FileServerSuite extends FunSuite with LocalSparkContext { @transient var tmpFile: File = _ @transient var tmpJarUrl: String = _ + def newConf: SparkConf = new SparkConf(loadDefaults = false).set("spark.authenticate", "false") + override def beforeEach() { super.beforeEach() resetSparkContext() - System.setProperty("spark.authenticate", "false") } override def beforeAll() { @@ -55,7 +56,6 @@ class FileServerSuite extends FunSuite with LocalSparkContext { val jarFile = new File(testTempDir, "test.jar") val jarStream = new FileOutputStream(jarFile) val jar = new JarOutputStream(jarStream, new java.util.jar.Manifest()) - System.setProperty("spark.authenticate", "false") val jarEntry = new JarEntry(textFile.getName) jar.putNextEntry(jarEntry) @@ -77,7 +77,7 @@ class FileServerSuite extends FunSuite with 
LocalSparkContext { } test("Distributing files locally") { - sc = new SparkContext("local[4]", "test") + sc = new SparkContext("local[4]", "test", newConf) sc.addFile(tmpFile.toString) val testData = Array((1,1), (1,1), (2,1), (3,5), (2,2), (3,0)) val result = sc.parallelize(testData).reduceByKey { @@ -111,7 +111,7 @@ class FileServerSuite extends FunSuite with LocalSparkContext { test("Distributing files locally using URL as input") { // addFile("file:///....") - sc = new SparkContext("local[4]", "test") + sc = new SparkContext("local[4]", "test", newConf) sc.addFile(new File(tmpFile.toString).toURI.toString) val testData = Array((1,1), (1,1), (2,1), (3,5), (2,2), (3,0)) val result = sc.parallelize(testData).reduceByKey { @@ -125,7 +125,7 @@ class FileServerSuite extends FunSuite with LocalSparkContext { } test ("Dynamically adding JARS locally") { - sc = new SparkContext("local[4]", "test") + sc = new SparkContext("local[4]", "test", newConf) sc.addJar(tmpJarUrl) val testData = Array((1, 1)) sc.parallelize(testData).foreach { x => @@ -136,7 +136,7 @@ class FileServerSuite extends FunSuite with LocalSparkContext { } test("Distributing files on a standalone cluster") { - sc = new SparkContext("local-cluster[1,1,512]", "test") + sc = new SparkContext("local-cluster[1,1,512]", "test", newConf) sc.addFile(tmpFile.toString) val testData = Array((1,1), (1,1), (2,1), (3,5), (2,2), (3,0)) val result = sc.parallelize(testData).reduceByKey { @@ -150,7 +150,7 @@ class FileServerSuite extends FunSuite with LocalSparkContext { } test ("Dynamically adding JARS on a standalone cluster") { - sc = new SparkContext("local-cluster[1,1,512]", "test") + sc = new SparkContext("local-cluster[1,1,512]", "test", newConf) sc.addJar(tmpJarUrl) val testData = Array((1,1)) sc.parallelize(testData).foreach { x => @@ -161,7 +161,7 @@ class FileServerSuite extends FunSuite with LocalSparkContext { } test ("Dynamically adding JARS on a standalone cluster using local: URL") { - sc = new SparkContext("local-cluster[1,1,512]", "test") + sc = new SparkContext("local-cluster[1,1,512]", "test", newConf) sc.addJar(tmpJarUrl.replace("file", "local")) val testData = Array((1,1)) sc.parallelize(testData).foreach { x => diff --git a/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala b/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala index a57430e829ced..a5d2b426df077 100644 --- a/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala +++ b/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala @@ -41,12 +41,11 @@ class JobCancellationSuite extends FunSuite with Matchers with BeforeAndAfter override def afterEach() { super.afterEach() resetSparkContext() - System.clearProperty("spark.scheduler.mode") } test("local mode, FIFO scheduler") { - System.setProperty("spark.scheduler.mode", "FIFO") - sc = new SparkContext("local[2]", "test") + val conf = new SparkConf().set("spark.scheduler.mode", "FIFO") + sc = new SparkContext("local[2]", "test", conf) testCount() testTake() // Make sure we can still launch tasks. 
@@ -54,10 +53,10 @@ class JobCancellationSuite extends FunSuite with Matchers with BeforeAndAfter } test("local mode, fair scheduler") { - System.setProperty("spark.scheduler.mode", "FAIR") + val conf = new SparkConf().set("spark.scheduler.mode", "FAIR") val xmlPath = getClass.getClassLoader.getResource("fairscheduler.xml").getFile() - System.setProperty("spark.scheduler.allocation.file", xmlPath) - sc = new SparkContext("local[2]", "test") + conf.set("spark.scheduler.allocation.file", xmlPath) + sc = new SparkContext("local[2]", "test", conf) testCount() testTake() // Make sure we can still launch tasks. @@ -65,8 +64,8 @@ class JobCancellationSuite extends FunSuite with Matchers with BeforeAndAfter } test("cluster mode, FIFO scheduler") { - System.setProperty("spark.scheduler.mode", "FIFO") - sc = new SparkContext("local-cluster[2,1,512]", "test") + val conf = new SparkConf().set("spark.scheduler.mode", "FIFO") + sc = new SparkContext("local-cluster[2,1,512]", "test", conf) testCount() testTake() // Make sure we can still launch tasks. @@ -74,10 +73,10 @@ class JobCancellationSuite extends FunSuite with Matchers with BeforeAndAfter } test("cluster mode, fair scheduler") { - System.setProperty("spark.scheduler.mode", "FAIR") + val conf = new SparkConf().set("spark.scheduler.mode", "FAIR") val xmlPath = getClass.getClassLoader.getResource("fairscheduler.xml").getFile() - System.setProperty("spark.scheduler.allocation.file", xmlPath) - sc = new SparkContext("local-cluster[2,1,512]", "test") + conf.set("spark.scheduler.allocation.file", xmlPath) + sc = new SparkContext("local-cluster[2,1,512]", "test", conf) testCount() testTake() // Make sure we can still launch tasks. diff --git a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala index 704b38233d202..625702bb7bc2a 100644 --- a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala +++ b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala @@ -31,19 +31,15 @@ class ShuffleSuite extends FunSuite with Matchers with LocalSparkContext { val conf = new SparkConf(loadDefaults = false) test("groupByKey without compression") { - try { - System.setProperty("spark.shuffle.compress", "false") - sc = new SparkContext("local", "test") - val pairs = sc.parallelize(Array((1, 1), (1, 2), (1, 3), (2, 1)), 4) - val groups = pairs.groupByKey(4).collect() - assert(groups.size === 2) - val valuesFor1 = groups.find(_._1 == 1).get._2 - assert(valuesFor1.toList.sorted === List(1, 2, 3)) - val valuesFor2 = groups.find(_._1 == 2).get._2 - assert(valuesFor2.toList.sorted === List(1)) - } finally { - System.setProperty("spark.shuffle.compress", "true") - } + val myConf = conf.clone().set("spark.shuffle.compress", "false") + sc = new SparkContext("local", "test", myConf) + val pairs = sc.parallelize(Array((1, 1), (1, 2), (1, 3), (2, 1)), 4) + val groups = pairs.groupByKey(4).collect() + assert(groups.size === 2) + val valuesFor1 = groups.find(_._1 == 1).get._2 + assert(valuesFor1.toList.sorted === List(1, 2, 3)) + val valuesFor2 = groups.find(_._1 == 2).get._2 + assert(valuesFor2.toList.sorted === List(1)) } test("shuffle non-zero block size") { diff --git a/core/src/test/scala/org/apache/spark/SparkConfSuite.scala b/core/src/test/scala/org/apache/spark/SparkConfSuite.scala index 87e9012622456..ae8a3c78c91ff 100644 --- a/core/src/test/scala/org/apache/spark/SparkConfSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkConfSuite.scala @@ -19,25 +19,19 @@ package org.apache.spark import 
org.scalatest.FunSuite -class SparkConfSuite extends FunSuite with LocalSparkContext { +import org.apache.spark.util.ResetSystemProperties + +class SparkConfSuite extends FunSuite with LocalSparkContext with ResetSystemProperties { test("loading from system properties") { - try { - System.setProperty("spark.test.testProperty", "2") - val conf = new SparkConf() - assert(conf.get("spark.test.testProperty") === "2") - } finally { - System.clearProperty("spark.test.testProperty") - } + System.setProperty("spark.test.testProperty", "2") + val conf = new SparkConf() + assert(conf.get("spark.test.testProperty") === "2") } test("initializing without loading defaults") { - try { - System.setProperty("spark.test.testProperty", "2") - val conf = new SparkConf(false) - assert(!conf.contains("spark.test.testProperty")) - } finally { - System.clearProperty("spark.test.testProperty") - } + System.setProperty("spark.test.testProperty", "2") + val conf = new SparkConf(false) + assert(!conf.contains("spark.test.testProperty")) } test("named set methods") { @@ -115,22 +109,16 @@ class SparkConfSuite extends FunSuite with LocalSparkContext { test("nested property names") { // This wasn't supported by some external conf parsing libraries - try { - System.setProperty("spark.test.a", "a") - System.setProperty("spark.test.a.b", "a.b") - System.setProperty("spark.test.a.b.c", "a.b.c") - val conf = new SparkConf() - assert(conf.get("spark.test.a") === "a") - assert(conf.get("spark.test.a.b") === "a.b") - assert(conf.get("spark.test.a.b.c") === "a.b.c") - conf.set("spark.test.a.b", "A.B") - assert(conf.get("spark.test.a") === "a") - assert(conf.get("spark.test.a.b") === "A.B") - assert(conf.get("spark.test.a.b.c") === "a.b.c") - } finally { - System.clearProperty("spark.test.a") - System.clearProperty("spark.test.a.b") - System.clearProperty("spark.test.a.b.c") - } + System.setProperty("spark.test.a", "a") + System.setProperty("spark.test.a.b", "a.b") + System.setProperty("spark.test.a.b.c", "a.b.c") + val conf = new SparkConf() + assert(conf.get("spark.test.a") === "a") + assert(conf.get("spark.test.a.b") === "a.b") + assert(conf.get("spark.test.a.b.c") === "a.b.c") + conf.set("spark.test.a.b", "A.B") + assert(conf.get("spark.test.a") === "a") + assert(conf.get("spark.test.a.b") === "A.B") + assert(conf.get("spark.test.a.b.c") === "a.b.c") } } diff --git a/core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala index 495a0d48633a4..161b7159b4e3c 100644 --- a/core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala @@ -27,10 +27,13 @@ import org.apache.spark.scheduler.local.LocalBackend class SparkContextSchedulerCreationSuite extends FunSuite with PrivateMethodTester with Logging with BeforeAndAfterEach { - def createTaskScheduler(master: String): TaskSchedulerImpl = { + def createTaskScheduler(master: String): TaskSchedulerImpl = + createTaskScheduler(master, new SparkConf()) + + def createTaskScheduler(master: String, conf: SparkConf): TaskSchedulerImpl = { // Create local SparkContext to setup a SparkEnv. We don't actually want to start() the // real schedulers, so we don't want to create a full SparkContext with the desired scheduler. 
- val sc = new SparkContext("local", "test") + val sc = new SparkContext("local", "test", conf) val createTaskSchedulerMethod = PrivateMethod[TaskScheduler]('createTaskScheduler) val sched = SparkContext invokePrivate createTaskSchedulerMethod(sc, master) sched.asInstanceOf[TaskSchedulerImpl] @@ -101,19 +104,13 @@ class SparkContextSchedulerCreationSuite } test("local-default-parallelism") { - val defaultParallelism = System.getProperty("spark.default.parallelism") - System.setProperty("spark.default.parallelism", "16") - val sched = createTaskScheduler("local") + val conf = new SparkConf().set("spark.default.parallelism", "16") + val sched = createTaskScheduler("local", conf) sched.backend match { case s: LocalBackend => assert(s.defaultParallelism() === 16) case _ => fail() } - - Option(defaultParallelism) match { - case Some(v) => System.setProperty("spark.default.parallelism", v) - case _ => System.clearProperty("spark.default.parallelism") - } } test("simr") { @@ -154,9 +151,10 @@ class SparkContextSchedulerCreationSuite testYarn("yarn-client", "org.apache.spark.scheduler.cluster.YarnClientClusterScheduler") } - def testMesos(master: String, expectedClass: Class[_]) { + def testMesos(master: String, expectedClass: Class[_], coarse: Boolean) { + val conf = new SparkConf().set("spark.mesos.coarse", coarse.toString) try { - val sched = createTaskScheduler(master) + val sched = createTaskScheduler(master, conf) assert(sched.backend.getClass === expectedClass) } catch { case e: UnsatisfiedLinkError => @@ -167,17 +165,14 @@ class SparkContextSchedulerCreationSuite } test("mesos fine-grained") { - System.setProperty("spark.mesos.coarse", "false") - testMesos("mesos://localhost:1234", classOf[MesosSchedulerBackend]) + testMesos("mesos://localhost:1234", classOf[MesosSchedulerBackend], coarse = false) } test("mesos coarse-grained") { - System.setProperty("spark.mesos.coarse", "true") - testMesos("mesos://localhost:1234", classOf[CoarseMesosSchedulerBackend]) + testMesos("mesos://localhost:1234", classOf[CoarseMesosSchedulerBackend], coarse = true) } test("mesos with zookeeper") { - System.setProperty("spark.mesos.coarse", "false") - testMesos("zk://localhost:1234,localhost:2345", classOf[MesosSchedulerBackend]) + testMesos("zk://localhost:1234,localhost:2345", classOf[MesosSchedulerBackend], coarse = false) } } diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index 4cba90e8f2afe..dba00edbd9a4b 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -23,12 +23,14 @@ import scala.collection.mutable.ArrayBuffer import org.apache.spark.{SparkConf, SparkContext, SparkEnv, SparkException, TestUtils} import org.apache.spark.deploy.SparkSubmit._ -import org.apache.spark.util.Utils +import org.apache.spark.util.{ResetSystemProperties, Utils} import org.scalatest.FunSuite import org.scalatest.Matchers import com.google.common.io.Files -class SparkSubmitSuite extends FunSuite with Matchers { +// Note: this suite mixes in ResetSystemProperties because SparkSubmit.main() sets a bunch +// of properties that neeed to be cleared after tests. 
+class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties { def beforeAll() { System.setProperty("spark.testing", "true") } diff --git a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala index 3b0b8e2f68c97..9ba49784bd754 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala @@ -27,9 +27,10 @@ import org.scalatest.Matchers import org.apache.spark.{LocalSparkContext, SparkContext} import org.apache.spark.SparkContext._ import org.apache.spark.executor.TaskMetrics +import org.apache.spark.util.ResetSystemProperties -class SparkListenerSuite extends FunSuite with LocalSparkContext with Matchers - with BeforeAndAfter with BeforeAndAfterAll { +class SparkListenerSuite extends FunSuite with LocalSparkContext with Matchers with BeforeAndAfter + with BeforeAndAfterAll with ResetSystemProperties { /** Length of time to wait while draining listener events. */ val WAIT_TIMEOUT_MILLIS = 10000 @@ -38,10 +39,6 @@ class SparkListenerSuite extends FunSuite with LocalSparkContext with Matchers sc = new SparkContext("local", "SparkListenerSuite") } - override def afterAll() { - System.clearProperty("spark.akka.frameSize") - } - test("basic creation and shutdown of LiveListenerBus") { val counter = new BasicJobCounter val bus = new LiveListenerBus diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala index c4e7a4bb7d385..8b224ab3266eb 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala @@ -21,7 +21,7 @@ import java.nio.ByteBuffer import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, FunSuite} -import org.apache.spark.{LocalSparkContext, SparkContext, SparkEnv} +import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkEnv} import org.apache.spark.storage.TaskResultBlockId /** @@ -55,27 +55,20 @@ class ResultDeletingTaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedule /** * Tests related to handling task results (both direct and indirect). */ -class TaskResultGetterSuite extends FunSuite with BeforeAndAfter with BeforeAndAfterAll - with LocalSparkContext { +class TaskResultGetterSuite extends FunSuite with BeforeAndAfter with LocalSparkContext { - override def beforeAll { - // Set the Akka frame size to be as small as possible (it must be an integer, so 1 is as small - // as we can make it) so the tests don't take too long. - System.setProperty("spark.akka.frameSize", "1") - } - - override def afterAll { - System.clearProperty("spark.akka.frameSize") - } + // Set the Akka frame size to be as small as possible (it must be an integer, so 1 is as small + // as we can make it) so the tests don't take too long. 
+ def conf: SparkConf = new SparkConf().set("spark.akka.frameSize", "1") test("handling results smaller than Akka frame size") { - sc = new SparkContext("local", "test") + sc = new SparkContext("local", "test", conf) val result = sc.parallelize(Seq(1), 1).map(x => 2 * x).reduce((x, y) => x) assert(result === 2) } test("handling results larger than Akka frame size") { - sc = new SparkContext("local", "test") + sc = new SparkContext("local", "test", conf) val akkaFrameSize = sc.env.actorSystem.settings.config.getBytes("akka.remote.netty.tcp.maximum-frame-size").toInt val result = sc.parallelize(Seq(1), 1).map(x => 1.to(akkaFrameSize).toArray).reduce((x, y) => x) @@ -89,7 +82,7 @@ class TaskResultGetterSuite extends FunSuite with BeforeAndAfter with BeforeAndA test("task retried if result missing from block manager") { // Set the maximum number of task failures to > 0, so that the task set isn't aborted // after the result is missing. - sc = new SparkContext("local[1,2]", "test") + sc = new SparkContext("local[1,2]", "test", conf) // If this test hangs, it's probably because no resource offers were made after the task // failed. val scheduler: TaskSchedulerImpl = sc.taskScheduler match { diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala index 7532da88c6065..40aaf9dd1f1e9 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala @@ -162,12 +162,12 @@ class TaskSchedulerImplSuite extends FunSuite with LocalSparkContext with Loggin } test("Fair Scheduler Test") { - sc = new SparkContext("local", "TaskSchedulerImplSuite") + val xmlPath = getClass.getClassLoader.getResource("fairscheduler.xml").getFile() + val conf = new SparkConf().set("spark.scheduler.allocation.file", xmlPath) + sc = new SparkContext("local", "TaskSchedulerImplSuite", conf) val taskScheduler = new TaskSchedulerImpl(sc) val taskSet = FakeTask.createTaskSet(1) - val xmlPath = getClass.getClassLoader.getResource("fairscheduler.xml").getFile() - System.setProperty("spark.scheduler.allocation.file", xmlPath) val rootPool = new Pool("", SchedulingMode.FAIR, 0, 0) val schedulableBuilder = new FairSchedulableBuilder(rootPool, sc.conf) schedulableBuilder.buildPools() diff --git a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala index 48c45bfe6b3e9..b212fa7fd3d42 100644 --- a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala @@ -31,10 +31,9 @@ import org.mockito.Matchers.any import org.mockito.Mockito.{doAnswer, mock, spy, when} import org.mockito.stubbing.Answer -import org.scalatest.{BeforeAndAfter, FunSuite, PrivateMethodTester} +import org.scalatest._ import org.scalatest.concurrent.Eventually._ import org.scalatest.concurrent.Timeouts._ -import org.scalatest.Matchers import org.apache.spark.{MapOutputTrackerMaster, SecurityManager, SparkConf} import org.apache.spark.executor.DataReadMethod @@ -42,7 +41,7 @@ import org.apache.spark.network.{Message, ConnectionManagerId} import org.apache.spark.scheduler.LiveListenerBus import org.apache.spark.serializer.{JavaSerializer, KryoSerializer} import org.apache.spark.storage.BlockManagerMessages.BlockManagerHeartbeat -import org.apache.spark.util.{AkkaUtils, ByteBufferInputStream, 
SizeEstimator, Utils} +import org.apache.spark.util._ import scala.collection.mutable.ArrayBuffer import scala.concurrent.Await @@ -50,15 +49,14 @@ import scala.concurrent.duration._ import scala.language.implicitConversions import scala.language.postfixOps -class BlockManagerSuite extends FunSuite with Matchers with BeforeAndAfter - with PrivateMethodTester { +class BlockManagerSuite extends FunSuite with Matchers with BeforeAndAfterEach + with PrivateMethodTester with ResetSystemProperties { private val conf = new SparkConf(false) var store: BlockManager = null var store2: BlockManager = null var actorSystem: ActorSystem = null var master: BlockManagerMaster = null - var oldArch: String = null conf.set("spark.authenticate", "false") val securityMgr = new SecurityManager(conf) val mapOutputTracker = new MapOutputTrackerMaster(conf) @@ -77,13 +75,13 @@ class BlockManagerSuite extends FunSuite with Matchers with BeforeAndAfter mapOutputTracker, shuffleManager) } - before { + override def beforeEach(): Unit = { val (actorSystem, boundPort) = AkkaUtils.createActorSystem( "test", "localhost", 0, conf = conf, securityManager = securityMgr) this.actorSystem = actorSystem // Set the arch to 64-bit and compressedOops to true to get a deterministic test-case - oldArch = System.setProperty("os.arch", "amd64") + System.setProperty("os.arch", "amd64") conf.set("os.arch", "amd64") conf.set("spark.test.useCompressedOops", "true") conf.set("spark.driver.port", boundPort.toString) @@ -98,7 +96,7 @@ class BlockManagerSuite extends FunSuite with Matchers with BeforeAndAfter SizeEstimator invokePrivate initialize() } - after { + override def afterEach(): Unit = { if (store != null) { store.stop() store = null @@ -111,14 +109,6 @@ class BlockManagerSuite extends FunSuite with Matchers with BeforeAndAfter actorSystem.awaitTermination() actorSystem = null master = null - - if (oldArch != null) { - conf.set("os.arch", oldArch) - } else { - System.clearProperty("os.arch") - } - - System.clearProperty("spark.test.useCompressedOops") } test("StorageLevel object caching") { diff --git a/core/src/test/scala/org/apache/spark/util/AkkaUtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/AkkaUtilsSuite.scala index c4765e53de17b..640e45f3f368f 100644 --- a/core/src/test/scala/org/apache/spark/util/AkkaUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/AkkaUtilsSuite.scala @@ -28,7 +28,7 @@ import scala.concurrent.Await /** * Test the AkkaUtils with various security settings. */ -class AkkaUtilsSuite extends FunSuite with LocalSparkContext { +class AkkaUtilsSuite extends FunSuite with LocalSparkContext with ResetSystemProperties { test("remote fetch security bad password") { val conf = new SparkConf diff --git a/core/src/test/scala/org/apache/spark/util/ResetSystemProperties.scala b/core/src/test/scala/org/apache/spark/util/ResetSystemProperties.scala new file mode 100644 index 0000000000000..d4b92f33dd9e6 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/util/ResetSystemProperties.scala @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.util + +import java.util.Properties + +import org.scalatest.{BeforeAndAfterEach, Suite} + +/** + * Mixin for automatically resetting system properties that are modified in ScalaTest tests. + * This resets the properties after each individual test. + * + * The order in which fixtures are mixed in affects the order in which they are invoked by tests. + * If we have a suite `MySuite extends FunSuite with Foo with Bar`, then + * Bar's `super` is Foo, so Bar's beforeEach() will and afterEach() methods will be invoked first + * by the rest runner. + * + * This means that ResetSystemProperties should appear as the last trait in test suites that it's + * mixed into in order to ensure that the system properties snapshot occurs as early as possible. + * ResetSystemProperties calls super.afterEach() before performing its own cleanup, ensuring that + * the old properties are restored as late as possible. + * + * See the "Composing fixtures by stacking traits" section at + * http://www.scalatest.org/user_guide/sharing_fixtures for more details about this pattern. + */ +private[spark] trait ResetSystemProperties extends BeforeAndAfterEach { this: Suite => + var oldProperties: Properties = null + + override def beforeEach(): Unit = { + oldProperties = new Properties(System.getProperties) + super.beforeEach() + } + + override def afterEach(): Unit = { + try { + super.afterEach() + } finally { + System.setProperties(oldProperties) + oldProperties = null + } + } +} diff --git a/core/src/test/scala/org/apache/spark/util/SizeEstimatorSuite.scala b/core/src/test/scala/org/apache/spark/util/SizeEstimatorSuite.scala index f9d1af88f3a13..c60f4347b88fd 100644 --- a/core/src/test/scala/org/apache/spark/util/SizeEstimatorSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/SizeEstimatorSuite.scala @@ -17,9 +17,7 @@ package org.apache.spark.util -import org.scalatest.BeforeAndAfterAll -import org.scalatest.FunSuite -import org.scalatest.PrivateMethodTester +import org.scalatest.{BeforeAndAfterEach, BeforeAndAfterAll, FunSuite, PrivateMethodTester} class DummyClass1 {} @@ -46,20 +44,12 @@ class DummyString(val arr: Array[Char]) { } class SizeEstimatorSuite - extends FunSuite with BeforeAndAfterAll with PrivateMethodTester { + extends FunSuite with BeforeAndAfterEach with PrivateMethodTester with ResetSystemProperties { - var oldArch: String = _ - var oldOops: String = _ - - override def beforeAll() { + override def beforeEach() { // Set the arch to 64-bit and compressedOops to true to get a deterministic test-case - oldArch = System.setProperty("os.arch", "amd64") - oldOops = System.setProperty("spark.test.useCompressedOops", "true") - } - - override def afterAll() { - resetOrClear("os.arch", oldArch) - resetOrClear("spark.test.useCompressedOops", oldOops) + System.setProperty("os.arch", "amd64") + System.setProperty("spark.test.useCompressedOops", "true") } test("simple classes") { @@ -122,7 +112,7 @@ class SizeEstimatorSuite } test("32-bit arch") { - val arch = System.setProperty("os.arch", "x86") + System.setProperty("os.arch", "x86") val initialize = 
PrivateMethod[Unit]('initialize) SizeEstimator invokePrivate initialize() @@ -131,14 +121,13 @@ class SizeEstimatorSuite assertResult(48)(SizeEstimator.estimate(DummyString("a"))) assertResult(48)(SizeEstimator.estimate(DummyString("ab"))) assertResult(56)(SizeEstimator.estimate(DummyString("abcdefgh"))) - resetOrClear("os.arch", arch) } // NOTE: The String class definition varies across JDK versions (1.6 vs. 1.7) and vendors // (Sun vs IBM). Use a DummyString class to make tests deterministic. test("64-bit arch with no compressed oops") { - val arch = System.setProperty("os.arch", "amd64") - val oops = System.setProperty("spark.test.useCompressedOops", "false") + System.setProperty("os.arch", "amd64") + System.setProperty("spark.test.useCompressedOops", "false") val initialize = PrivateMethod[Unit]('initialize) SizeEstimator invokePrivate initialize() @@ -146,16 +135,5 @@ class SizeEstimatorSuite assertResult(64)(SizeEstimator.estimate(DummyString("a"))) assertResult(64)(SizeEstimator.estimate(DummyString("ab"))) assertResult(72)(SizeEstimator.estimate(DummyString("abcdefgh"))) - - resetOrClear("os.arch", arch) - resetOrClear("spark.test.useCompressedOops", oops) - } - - def resetOrClear(prop: String, oldValue: String) { - if (oldValue != null) { - System.setProperty(prop, oldValue) - } else { - System.clearProperty(prop) - } } } diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index a530e0be183b0..5ae3da2bd1f5f 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -29,7 +29,7 @@ import com.google.common.base.Charsets import com.google.common.io.Files import org.scalatest.FunSuite -class UtilsSuite extends FunSuite { +class UtilsSuite extends FunSuite with ResetSystemProperties { test("bytesToString") { assert(Utils.bytesToString(10) === "10.0 B") diff --git a/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala b/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala index 973049b95a7bd..f49a96edb94df 100644 --- a/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala +++ b/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala @@ -28,11 +28,9 @@ object BroadcastTest { val bcName = if (args.length > 2) args(2) else "Http" val blockSize = if (args.length > 3) args(3) else "4096" - System.setProperty("spark.broadcast.factory", "org.apache.spark.broadcast." 
+ bcName + - "BroadcastFactory") - System.setProperty("spark.broadcast.blockSize", blockSize) val sparkConf = new SparkConf().setAppName("Broadcast Test") - + .set("spark.broadcast.factory", s"org.apache.spark.broadcast.${bcName}BroadcastFactory") + .set("spark.broadcast.blockSize", blockSize) val sc = new SparkContext(sparkConf) val slices = if (args.length > 0) args(0).toInt else 2 diff --git a/streaming/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java b/streaming/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java index 6e1f01900071b..1e24da7f5f60c 100644 --- a/streaming/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java +++ b/streaming/src/test/java/org/apache/spark/streaming/LocalJavaStreamingContext.java @@ -17,6 +17,7 @@ package org.apache.spark.streaming; +import org.apache.spark.SparkConf; import org.apache.spark.streaming.api.java.JavaStreamingContext; import org.junit.After; import org.junit.Before; @@ -27,8 +28,11 @@ public abstract class LocalJavaStreamingContext { @Before public void setUp() { - System.setProperty("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock"); - ssc = new JavaStreamingContext("local[2]", "test", new Duration(1000)); + SparkConf conf = new SparkConf() + .setMaster("local[2]") + .setAppName("test") + .set("spark.streaming.clock", "org.apache.spark.streaming.util.ManualClock"); + ssc = new JavaStreamingContext(conf, new Duration(1000)); ssc.checkpoint("checkpoint"); } diff --git a/tools/src/main/scala/org/apache/spark/tools/StoragePerfTester.scala b/tools/src/main/scala/org/apache/spark/tools/StoragePerfTester.scala index 17bf7c2541d13..efe2a4f591c4f 100644 --- a/tools/src/main/scala/org/apache/spark/tools/StoragePerfTester.scala +++ b/tools/src/main/scala/org/apache/spark/tools/StoragePerfTester.scala @@ -20,7 +20,7 @@ package org.apache.spark.tools import java.util.concurrent.{CountDownLatch, Executors} import java.util.concurrent.atomic.AtomicLong -import org.apache.spark.SparkContext +import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.serializer.KryoSerializer import org.apache.spark.util.Utils import org.apache.spark.executor.ShuffleWriteMetrics @@ -48,11 +48,12 @@ object StoragePerfTester { val writeData = "1" * recordLength val executor = Executors.newFixedThreadPool(numMaps) - System.setProperty("spark.shuffle.compress", "false") - System.setProperty("spark.shuffle.sync", "true") + val conf = new SparkConf() + .set("spark.shuffle.compress", "false") + .set("spark.shuffle.sync", "true") // This is only used to instantiate a BlockManager. All thread scheduling is done manually. - val sc = new SparkContext("local[4]", "Write Tester") + val sc = new SparkContext("local[4]", "Write Tester", conf) val blockManager = sc.env.blockManager def writeOutputBytes(mapId: Int, total: AtomicLong) = { From 08d4f704871ebe13b875755a1cf2fadb44da43e4 Mon Sep 17 00:00:00 2001 From: Brennon York Date: Wed, 31 Dec 2014 11:54:10 -0800 Subject: [PATCH 64/72] [SPARK-4298][Core] - The spark-submit cannot read Main-Class from Manifest. Resolves a bug where the `Main-Class` from a .jar file wasn't being read in properly. This was caused by the fact that the `primaryResource` object was a URI and needed to be normalized through a call to `.getPath` before it could be passed into the `JarFile` object.
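For reference, the manifest lookup this patch introduces boils down to the following sketch, assuming a local `file:` URI; `mainClassFromJar` is an illustrative helper, not the actual spark-submit code, which follows in the diff below:

```scala
import java.net.URI
import java.util.jar.JarFile

// Read the Main-Class attribute from a locally accessible jar. The URI is
// normalized via getPath before being handed to JarFile, which expects a plain
// filesystem path rather than a "file:" URI.
def mainClassFromJar(primaryResource: String): Option[String] = {
  try {
    val uri = new URI(primaryResource)
    val scheme = uri.getScheme
    if (scheme != null && scheme != "file") {
      None  // remote schemes (hdfs, http, ...) cannot be opened as a local JarFile
    } else {
      val jar = new JarFile(uri.getPath)  // strips the "file:" prefix
      try {
        Option(jar.getManifest).flatMap(m => Option(m.getMainAttributes.getValue("Main-Class")))
      } finally {
        jar.close()
      }
    }
  } catch {
    case _: Exception => None  // malformed URI, unreadable jar, or no manifest
  }
}
```

Remote schemes are rejected up front because a `JarFile` can only open a local path, which is why the patch asks the user to pass an explicit `--class` in that case.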
Author: Brennon York Closes #3561 from brennonyork/SPARK-4298 and squashes the following commits: 5e0fce1 [Brennon York] Use string interpolation for error messages, moved comment line from original code to above its necessary code segment 14daa20 [Brennon York] pushed mainClass assignment into match statement, removed spurious spaces, removed { } from case statements, removed return values c6dad68 [Brennon York] Set case statement to support multiple jar URI's and enabled the 'file' URI to load the main-class 8d20936 [Brennon York] updated to reset the error message back to the default a043039 [Brennon York] updated to split the uri and jar vals 8da7cbf [Brennon York] fixes SPARK-4298 (cherry picked from commit 8e14c5eb551ab06c94859c7f6d8c6b62b4d00d59) Signed-off-by: Josh Rosen Conflicts: core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala --- .../spark/deploy/SparkSubmitArguments.scala | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala index 8b0a54e1190ae..dc14bd6d0bcab 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala @@ -19,6 +19,7 @@ package org.apache.spark.deploy import java.io.{File, FileInputStream, IOException} import java.util.Properties +import java.net.URI import java.util.jar.JarFile import scala.collection.JavaConversions._ @@ -126,14 +127,23 @@ private[spark] class SparkSubmitArguments(args: Seq[String], env: Map[String, St // Try to set main class from JAR if no --class argument is given if (mainClass == null && !isPython && primaryResource != null) { - try { - val jar = new JarFile(primaryResource) - // Note that this might still return null if no main-class is set; we catch that later - mainClass = jar.getManifest.getMainAttributes.getValue("Main-Class") - } catch { - case e: Exception => - SparkSubmit.printErrorAndExit("Cannot load main class from JAR: " + primaryResource) - return + val uri = new URI(primaryResource) + val uriScheme = uri.getScheme() + + uriScheme match { + case "file" => + try { + val jar = new JarFile(uri.getPath) + // Note that this might still return null if no main-class is set; we catch that later + mainClass = jar.getManifest.getMainAttributes.getValue("Main-Class") + } catch { + case e: Exception => + SparkSubmit.printErrorAndExit(s"Cannot load main class from JAR $primaryResource") + } + case _ => + SparkSubmit.printErrorAndExit( + s"Cannot load main class from JAR $primaryResource with URI $uriScheme. " + + "Please specify a class through --class.") } } From 1034707c7a366578674122d0d6fa00f2f9dce7e0 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Fri, 12 Dec 2014 12:38:37 -0800 Subject: [PATCH 65/72] [HOTFIX] Disable Spark UI in SparkSubmitSuite tests This should fix a major cause of build breaks when running many parallel tests. 
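The change itself just threads an extra `--conf spark.ui.enabled=false` through the test's spark-submit arguments; expressed directly against `SparkConf`, the idea looks roughly like this (an illustrative sketch, not part of the patch):

```scala
import org.apache.spark.{SparkConf, SparkContext}

// Tests that start many SparkContexts in parallel should either disable the web UI
// or bind it to an ephemeral port, so the JVMs do not race for the default UI port.
val conf = new SparkConf()
  .setMaster("local")
  .setAppName("testApp")
  .set("spark.ui.enabled", "false")  // skip starting the UI at all
  // .set("spark.ui.port", "0")      // alternative used by the DriverSuite hotfix below

val sc = new SparkContext(conf)
try {
  assert(sc.parallelize(1 to 100, 4).count() == 100)
} finally {
  sc.stop()
}
```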
--- .../test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index dba00edbd9a4b..05e7ac2d11481 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -291,6 +291,7 @@ class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties "--class", SimpleApplicationTest.getClass.getName.stripSuffix("$"), "--name", "testApp", "--master", "local", + "--conf", "spark.ui.enabled=false", unusedJar.toString) runSparkSubmit(args) } @@ -305,6 +306,7 @@ class SparkSubmitSuite extends FunSuite with Matchers with ResetSystemProperties "--name", "testApp", "--master", "local-cluster[2,1,512]", "--jars", jarsString, + "--conf", "spark.ui.enabled=false", unusedJar.toString) runSparkSubmit(args) } From 61eb9be4b1a6597666d59baf1f53541b47633234 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 31 Dec 2014 16:02:47 -0800 Subject: [PATCH 66/72] [SPARK-5035] [Streaming] ReceiverMessage trait should extend Serializable Spark Streaming's ReceiverMessage trait should extend Serializable in order to fix a subtle bug that only occurs when running on a real cluster: If you attempt to send a fire-and-forget message to a remote Akka actor and that message cannot be serialized, then this seems to lead to more-or-less silent failures. As an optimization, Akka skips message serialization for messages sent within the same JVM. As a result, Spark's unit tests will never fail due to non-serializable Akka messages, but these will cause mostly-silent failures when running on a real cluster. Before this patch, here was the code for ReceiverMessage: ``` /** Messages sent to the NetworkReceiver. */ private[streaming] sealed trait ReceiverMessage private[streaming] object StopReceiver extends ReceiverMessage ``` Since ReceiverMessage does not extend Serializable and StopReceiver is a regular `object`, not a `case object`, StopReceiver will throw serialization errors. As a result, graceful receiver shutdown is broken on real clusters (and local-cluster mode) but works in local modes. If you want to reproduce this, try running the word count example from the Streaming Programming Guide in the Spark shell: ``` import org.apache.spark._ import org.apache.spark.streaming._ import org.apache.spark.streaming.StreamingContext._ val ssc = new StreamingContext(sc, Seconds(10)) // Create a DStream that will connect to hostname:port, like localhost:9999 val lines = ssc.socketTextStream("localhost", 9999) // Split each line into words val words = lines.flatMap(_.split(" ")) import org.apache.spark.streaming.StreamingContext._ // Count each word in each batch val pairs = words.map(word => (word, 1)) val wordCounts = pairs.reduceByKey(_ + _) // Print the first ten elements of each RDD generated in this DStream to the console wordCounts.print() ssc.start() Thread.sleep(10000) ssc.stop(true, true) ``` Prior to this patch, this would work correctly in local mode but fail when running against a real cluster (it would report that some receivers were not shut down). Author: Josh Rosen Closes #3857 from JoshRosen/SPARK-5035 and squashes the following commits: 71d0eae [Josh Rosen] [SPARK-5035] ReceiverMessage trait should extend Serializable. 
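A cheap way to guard against this class of bug is to round-trip control messages through Java serialization in a unit test, which approximates what Akka must do for a genuinely remote send (an illustrative sketch; the `roundTrip` helper is made up for this example):

```scala
import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

// Throws NotSerializableException if `msg` could not be shipped to a remote actor.
def roundTrip[T <: AnyRef](msg: T): T = {
  val buffer = new ByteArrayOutputStream()
  val out = new ObjectOutputStream(buffer)
  out.writeObject(msg)
  out.close()
  val in = new ObjectInputStream(new ByteArrayInputStream(buffer.toByteArray))
  try in.readObject().asInstanceOf[T] finally in.close()
}

// With this patch applied, roundTrip(StopReceiver) succeeds because ReceiverMessage
// now extends Serializable; declaring StopReceiver as a `case object` would also
// have made it serializable.
```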
(cherry picked from commit fe6efacc0b865e9e827a1565877077000e63976e) Signed-off-by: Tathagata Das --- .../org/apache/spark/streaming/receiver/ReceiverMessage.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverMessage.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverMessage.scala index bf39d1e891cae..ab9fa192191aa 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverMessage.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverMessage.scala @@ -18,6 +18,6 @@ package org.apache.spark.streaming.receiver /** Messages sent to the NetworkReceiver. */ -private[streaming] sealed trait ReceiverMessage +private[streaming] sealed trait ReceiverMessage extends Serializable private[streaming] object StopReceiver extends ReceiverMessage From c532acf5e9902c27c23b4c23980d72fa48bd23ec Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Thu, 1 Jan 2015 15:03:54 -0800 Subject: [PATCH 67/72] [HOTFIX] Bind web UI to ephemeral port in DriverSuite The job launched by DriverSuite should bind the web UI to an ephemeral port, since it looks like port contention in this test has caused a large number of Jenkins failures when many builds are started simultaneously. Our tests already disable the web UI, but this doesn't affect subprocesses launched by our tests. In this case, I've opted to bind to an ephemeral port instead of disabling the UI because disabling features in this test may mask its ability to catch certain bugs. See also: e24d3a9 Author: Josh Rosen Closes #3873 from JoshRosen/driversuite-webui-port and squashes the following commits: 48cd05c [Josh Rosen] [HOTFIX] Bind web UI to ephemeral port in DriverSuite. (cherry picked from commit 012839807c3dc6e7c8c41ac6e956d52a550bb031) Signed-off-by: Josh Rosen --- core/src/test/scala/org/apache/spark/DriverSuite.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/core/src/test/scala/org/apache/spark/DriverSuite.scala b/core/src/test/scala/org/apache/spark/DriverSuite.scala index 4b1d280624c57..5d2633680d632 100644 --- a/core/src/test/scala/org/apache/spark/DriverSuite.scala +++ b/core/src/test/scala/org/apache/spark/DriverSuite.scala @@ -52,7 +52,10 @@ class DriverSuite extends FunSuite with Timeouts { object DriverWithoutCleanup { def main(args: Array[String]) { Utils.configTestLog4j("INFO") - val sc = new SparkContext(args(0), "DriverWithoutCleanup") + // Bind the web UI to an ephemeral port in order to avoid conflicts with other tests running on + // the same machine (we shouldn't just disable the UI here, since that might mask bugs): + val conf = new SparkConf().set("spark.ui.port", "0") + val sc = new SparkContext(args(0), "DriverWithoutCleanup", conf) sc.parallelize(1 to 100, 4).count() } } From 6d1ca2392a403184511ea1e8508d72778901d57e Mon Sep 17 00:00:00 2001 From: Dale Date: Sun, 4 Jan 2015 13:28:37 -0800 Subject: [PATCH 68/72] [SPARK-4787] Stop SparkContext if a DAGScheduler init error occurs Author: Dale Closes #3809 from tigerquoll/SPARK-4787 and squashes the following commits: 5661e01 [Dale] [SPARK-4787] Ensure that call to stop() doesn't lose the exception by using a finally block. 2172578 [Dale] [SPARK-4787] Stop context properly if an exception occurs during DAGScheduler initialization. 
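The heart of the fix is the "clean up, but keep the original failure" pattern: the cleanup call sits inside the error handler, and the rethrow happens in a `finally` block so that a failing `stop()` cannot swallow the root cause. A generic sketch of that pattern (illustrative names only; the actual `SparkContext` change appears in the diff below):

```scala
// Run an initialization step; on failure, attempt best-effort cleanup but make
// sure the original exception (wrapped with context) still reaches the caller,
// even if the cleanup itself throws.
def initWithCleanup(cleanup: () => Unit)(init: => Unit): Unit = {
  try {
    init
  } catch {
    case e: Exception =>
      try {
        cleanup()  // e.g. SparkContext.stop() releasing partially-created services
      } finally {
        throw new RuntimeException("Initialization failed", e)
      }
  }
}
```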
(cherry picked from commit 3fddc9468fa50e7683caa973fec6c52e1132268d) Signed-off-by: Josh Rosen --- core/src/main/scala/org/apache/spark/SparkContext.scala | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index e1e7e83570c4b..61c50a8953ff4 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -324,8 +324,13 @@ class SparkContext(config: SparkConf) extends Logging { try { dagScheduler = new DAGScheduler(this) } catch { - case e: Exception => throw - new SparkException("DAGScheduler cannot be initialized due to %s".format(e.getMessage)) + case e: Exception => { + try { + stop() + } finally { + throw new SparkException("Error while constructing DAGScheduler", e) + } + } } // start TaskScheduler after taskScheduler sets DAGScheduler reference in DAGScheduler's From 55325afbedded660f7ce8e335986ab0c0eea26ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?hushan=5B=E8=83=A1=E7=8F=8A=5D?= Date: Wed, 7 Jan 2015 12:09:12 -0800 Subject: [PATCH 69/72] [SPARK-5132][Core]Correct stage Attempt Id key in stageInfofromJson MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SPARK-5132: stageInfoToJson: Stage Attempt Id stageInfoFromJson: Attempt Id Author: hushan[胡珊] Closes #3932 from suyanNone/json-stage and squashes the following commits: 41419ab [hushan[胡珊]] Correct stage Attempt Id key in stageInfofromJson (cherry picked from commit d345ebebd554ac3faa4e870bd7800ed02e89da58) Signed-off-by: Josh Rosen --- core/src/main/scala/org/apache/spark/util/JsonProtocol.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala index 2f7481e0ab4d7..b6fdfafb752bb 100644 --- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala +++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala @@ -496,7 +496,7 @@ private[spark] object JsonProtocol { def stageInfoFromJson(json: JValue): StageInfo = { val stageId = (json \ "Stage ID").extract[Int] - val attemptId = (json \ "Attempt ID").extractOpt[Int].getOrElse(0) + val attemptId = (json \ "Stage Attempt ID").extractOpt[Int].getOrElse(0) val stageName = (json \ "Stage Name").extract[String] val numTasks = (json \ "Number of Tasks").extract[Int] val rddInfos = (json \ "RDD Info").extract[List[JValue]].map(rddInfoFromJson(_)) From 677281edcee67b0e3a6beef0f382e5ba865f2be8 Mon Sep 17 00:00:00 2001 From: Mark Hamstra Date: Fri, 2 Jan 2015 08:59:09 -0800 Subject: [PATCH 70/72] Added ByteswapPartitioner and made it the defaultPartitioner --- .../scala/org/apache/spark/Partitioner.scala | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/Partitioner.scala b/core/src/main/scala/org/apache/spark/Partitioner.scala index e53a78ead2c0e..8e7e3dd459d37 100644 --- a/core/src/main/scala/org/apache/spark/Partitioner.scala +++ b/core/src/main/scala/org/apache/spark/Partitioner.scala @@ -60,9 +60,9 @@ object Partitioner { return r.partitioner.get } if (rdd.context.conf.contains("spark.default.parallelism")) { - new HashPartitioner(rdd.context.defaultParallelism) + new ByteswapPartitioner(rdd.context.defaultParallelism) } else { - new HashPartitioner(bySize.head.partitions.size) + new ByteswapPartitioner(bySize.head.partitions.size) 
} } } @@ -93,6 +93,18 @@ class HashPartitioner(partitions: Int) extends Partitioner { override def hashCode: Int = numPartitions } +/** + * A [[org.apache.spark.Partitioner]] that implements hash-based partitioning using + * Java's `Object.hashCode`. In order to spread-out hashCodes that are divisible by + * `numPartitions`, `byteswap32` is applied to the hashCodes before modding by `numPartitions`. + */ +class ByteswapPartitioner(partitions: Int) extends HashPartitioner(partitions) { + override def getPartition(key: Any): Int = key match { + case null => 0 + case _ => Utils.nonNegativeMod(byteswap32(key.hashCode), numPartitions) + } +} + /** * A [[org.apache.spark.Partitioner]] that partitions sortable records by range into roughly * equal ranges. The ranges are determined by sampling the content of the RDD passed in. From 470f026e88488f157e1418943c30e668b27080eb Mon Sep 17 00:00:00 2001 From: Mark Hamstra Date: Mon, 5 Jan 2015 15:18:34 -0800 Subject: [PATCH 71/72] Added spark.default.partitioner to conf --- .../main/scala/org/apache/spark/Partitioner.scala | 13 +++++++++++-- .../scala/org/apache/spark/rdd/PipedRDDSuite.scala | 10 ++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/Partitioner.scala b/core/src/main/scala/org/apache/spark/Partitioner.scala index 8e7e3dd459d37..214b9be8c7d81 100644 --- a/core/src/main/scala/org/apache/spark/Partitioner.scala +++ b/core/src/main/scala/org/apache/spark/Partitioner.scala @@ -55,14 +55,23 @@ object Partitioner { * We use two method parameters (rdd, others) to enforce callers passing at least 1 RDD. */ def defaultPartitioner(rdd: RDD[_], others: RDD[_]*): Partitioner = { + val shortPartitionerNames = Map( + "hash" -> "org.apache.spark.HashPartitioner", + "byteswap" -> "org.apache.spark.ByteswapPartitioner" + ) + val defaultPartitionerName = rdd.conf.get("spark.default.partitioner", "hash") + val className = + shortPartitionerNames.getOrElse(defaultPartitionerName.toLowerCase, defaultPartitionerName) + val ctor = Class.forName(className, true, Utils.getContextOrSparkClassLoader) + .getConstructor(classOf[Int]) val bySize = (Seq(rdd) ++ others).sortBy(_.partitions.size).reverse for (r <- bySize if r.partitioner.isDefined) { return r.partitioner.get } if (rdd.context.conf.contains("spark.default.parallelism")) { - new ByteswapPartitioner(rdd.context.defaultParallelism) + ctor.newInstance(rdd.context.defaultParallelism: java.lang.Integer).asInstanceOf[Partitioner] } else { - new ByteswapPartitioner(bySize.head.partitions.size) + ctor.newInstance(bySize.head.partitions.size: java.lang.Integer).asInstanceOf[Partitioner] } } } diff --git a/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala index be972c5e97a7e..9f11db88f92b2 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala @@ -32,6 +32,16 @@ import scala.util.Try class PipedRDDSuite extends FunSuite with SharedSparkContext { + override def beforeAll() { + System.setProperty("spark.default.partitioner", "hash") + super.beforeAll() + } + + override def afterAll() { + System.clearProperty("spark.default.partitioner") + super.afterAll() + } + test("basic pipe") { if (testCommandAvailable("cat")) { val nums = sc.makeRDD(Array(1, 2, 3, 4), 2) From 99910b19d70519bd45e7251b739f7158f8685fcd Mon Sep 17 00:00:00 2001 From: Mark Hamstra Date: Mon, 5 Jan 2015 16:56:19 -0800 Subject: [PATCH 72/72] 
spark.default.partitioner docs --- docs/configuration.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docs/configuration.md b/docs/configuration.md index 13fc251c1733f..029213558fcd4 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -488,6 +488,17 @@ Apart from these, the following properties are also available, and may be useful (groupByKey, reduceByKey, etc) when not set by user. + + spark.default.partitioner + hash + + Implementation to use for partitioning key-value data pairs. There are two implementations + available: hash and byteswap. Both are based on the hashCode of + the keys mod the number of partitions, but the byteswap partitioner also + applies byteswap32 to the hash codes, which helps guarantee that all partitions are used + even when the hash codes are divisible by a factor of the number of partitions. + + spark.broadcast.factory org.apache.spark.broadcast.
TorrentBroadcastFactory