From 754085f2c35f385ad743202818a0eeeefc1bc714 Mon Sep 17 00:00:00 2001
From: Reynold Xin
Date: Wed, 16 Jul 2014 16:56:55 -0700
Subject: [PATCH] Explain why broadcasting serialized copy of the task.

---
 core/src/main/scala/org/apache/spark/rdd/RDD.scala | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
index 0fda13b3a6823..ff355546989d2 100644
--- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala
+++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -1207,8 +1207,10 @@ abstract class RDD[T: ClassTag](
   // =======================================================================
 
   /**
-   * Broadcasted copy of this RDD, used to dispatch tasks to executors. Note that this is
-   * a lazy val so the broadcast is created only when tasks are scheduled on this RDD.
+   * Broadcasted copy of this RDD, used to dispatch tasks to executors. Note that we broadcast
+   * the serialized copy of the RDD and for each task we will deserialize it, which means each
+   * task gets a different copy of the RDD. This provides stronger isolation between tasks that
+   * might modify state of objects referenced in their closures.
    */
   @transient private[spark] lazy val broadcasted = {
     val ser = SparkEnv.get.closureSerializer.newInstance()