[SPARK-25258][SPARK-23131][SPARK-25176][BUILD] Upgrade Kryo to 4.0.2

## What changes were proposed in this pull request? Upgrade chill to 0.9.3, Kryo to 4.0.2, to get bug fixes and improvements. The resolved tickets include: - SPARK-25258 Upgrade kryo package to version 4.0.2 - SPARK-23131 Kryo raises StackOverflow during serializing GLR model - SPARK-25176 Kryo fails to serialize a parametrised type hierarchy More details: https://github.com/twitter/chill/releases/tag/v0.9.3 twitter/chill@cc3910d ## How was this patch tested? Existing tests, plus new regression tests for SPARK-25176 and SPARK-23131. Closes #22179 from wangyum/SPARK-23131. Lead-authored-by: Yuming Wang <yumwang@ebay.com> Co-authored-by: Dongjoon Hyun <dongjoon@apache.org> Signed-off-by: Sean Owen <sean.owen@databricks.com>
apache · Sep 5, 2018 · 3e03303 · 3e03303
1 parent 458468a
commit 3e03303
Show file tree

Hide file tree

Showing 7 changed files with 48 additions and 15 deletions.
diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala
@@ -412,6 +412,26 @@ class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext {
     assert(!ser2.getAutoReset)
   }
 
+  test("SPARK-25176 ClassCastException when writing a Map after previously " +
+    "reading a Map with different generic type") {
+    // This test uses the example in https://github.com/EsotericSoftware/kryo/issues/384
+    import java.util._
+    val ser = new KryoSerializer(new SparkConf).newInstance()
+    // Holder whose two fields refer to the same HashMap[Int, String] instance.
+    class MapHolder {
+      private val mapOne = new HashMap[Int, String]
+      private val mapTwo = this.mapOne
+    }
+    // Step 1: write and read back the holder, so a Map has been read by this instance.
+    val serializedMapHolder = ser.serialize(new MapHolder)
+    ser.deserialize[MapHolder](serializedMapHolder)
+    // Step 2: write a Map with a different generic type and check it survives the round trip.
+    val listMap = new HashMap[Int, List[String]]
+    listMap.put(1, new ArrayList[String])
+    val serializedMap = ser.serialize[Map[Int, List[String]]](listMap)
+    assert(ser.deserialize[HashMap[Int, List[String]]](serializedMap) === listMap)
+  }
+
   private def testSerializerInstanceReuse(autoReset: Boolean, referenceTracking: Boolean): Unit = {
     val conf = new SparkConf(loadDefaults = false)
       .set("spark.kryo.referenceTracking", referenceTracking.toString)

diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
@@ -27,8 +27,8 @@ breeze_2.11-0.13.2.jar
 calcite-avatica-1.2.0-incubating.jar
 calcite-core-1.2.0-incubating.jar
 calcite-linq4j-1.2.0-incubating.jar
-chill-java-0.8.4.jar
-chill_2.11-0.8.4.jar
+chill-java-0.9.3.jar
+chill_2.11-0.9.3.jar
 commons-beanutils-1.7.0.jar
 commons-beanutils-core-1.8.0.jar
 commons-cli-1.2.jar
@@ -130,7 +130,7 @@ jsr305-1.3.9.jar
 jta-1.1.jar
 jtransforms-2.4.0.jar
 jul-to-slf4j-1.7.16.jar
-kryo-shaded-3.0.3.jar
+kryo-shaded-4.0.2.jar
 kubernetes-client-3.0.0.jar
 kubernetes-model-2.0.0.jar
 leveldbjni-all-1.8.jar
@@ -149,7 +149,7 @@ metrics-jvm-3.1.5.jar
 minlog-1.3.0.jar
 netty-3.9.9.Final.jar
 netty-all-4.1.17.Final.jar
-objenesis-2.1.jar
+objenesis-2.5.1.jar
 okhttp-3.8.1.jar
 okio-1.13.0.jar
 opencsv-2.3.jar

diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
@@ -27,8 +27,8 @@ breeze_2.11-0.13.2.jar
 calcite-avatica-1.2.0-incubating.jar
 calcite-core-1.2.0-incubating.jar
 calcite-linq4j-1.2.0-incubating.jar
-chill-java-0.8.4.jar
-chill_2.11-0.8.4.jar
+chill-java-0.9.3.jar
+chill_2.11-0.9.3.jar
 commons-beanutils-1.7.0.jar
 commons-beanutils-core-1.8.0.jar
 commons-cli-1.2.jar
@@ -132,7 +132,7 @@ jsr305-1.3.9.jar
 jta-1.1.jar
 jtransforms-2.4.0.jar
 jul-to-slf4j-1.7.16.jar
-kryo-shaded-3.0.3.jar
+kryo-shaded-4.0.2.jar
 kubernetes-client-3.0.0.jar
 kubernetes-model-2.0.0.jar
 leveldbjni-all-1.8.jar
@@ -151,7 +151,7 @@ metrics-jvm-3.1.5.jar
 minlog-1.3.0.jar
 netty-3.9.9.Final.jar
 netty-all-4.1.17.Final.jar
-objenesis-2.1.jar
+objenesis-2.5.1.jar
 okhttp-3.8.1.jar
 okio-1.13.0.jar
 opencsv-2.3.jar

diff --git a/dev/deps/spark-deps-hadoop-3.1 b/dev/deps/spark-deps-hadoop-3.1
@@ -25,8 +25,8 @@ breeze_2.11-0.13.2.jar
 calcite-avatica-1.2.0-incubating.jar
 calcite-core-1.2.0-incubating.jar
 calcite-linq4j-1.2.0-incubating.jar
-chill-java-0.8.4.jar
-chill_2.11-0.8.4.jar
+chill-java-0.9.3.jar
+chill_2.11-0.9.3.jar
 commons-beanutils-1.9.3.jar
 commons-cli-1.2.jar
 commons-codec-1.10.jar
@@ -146,7 +146,7 @@ kerby-config-1.0.1.jar
 kerby-pkix-1.0.1.jar
 kerby-util-1.0.1.jar
 kerby-xdr-1.0.1.jar
-kryo-shaded-3.0.3.jar
+kryo-shaded-4.0.2.jar
 kubernetes-client-3.0.0.jar
 kubernetes-model-2.0.0.jar
 leveldbjni-all-1.8.jar
@@ -167,7 +167,7 @@ mssql-jdbc-6.2.1.jre7.jar
 netty-3.9.9.Final.jar
 netty-all-4.1.17.Final.jar
 nimbus-jose-jwt-4.41.1.jar
-objenesis-2.1.jar
+objenesis-2.5.1.jar
 okhttp-2.7.5.jar
 okhttp-3.8.1.jar
 okio-1.13.0.jar

diff --git a/docs/tuning.md b/docs/tuning.md
@@ -35,7 +35,7 @@ in your operations) and performance. It provides two serialization libraries:
   Java serialization is flexible but often quite slow, and leads to large
   serialized formats for many classes.
 * [Kryo serialization](https://github.com/EsotericSoftware/kryo): Spark can also use
-  the Kryo library (version 2) to serialize objects more quickly. Kryo is significantly
+  the Kryo library (version 4) to serialize objects more quickly. Kryo is significantly
   faster and more compact than Java serialization (often as much as 10x), but does not support all
   `Serializable` types and requires you to *register* the classes you'll use in the program in advance
   for best performance.

diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
@@ -19,7 +19,7 @@ package org.apache.spark.ml.regression
 
 import scala.util.Random
 
-import org.apache.spark.SparkFunSuite
+import org.apache.spark.{SparkConf, SparkFunSuite}
 import org.apache.spark.ml.classification.LogisticRegressionSuite._
 import org.apache.spark.ml.feature.{Instance, OffsetInstance}
 import org.apache.spark.ml.feature.{LabeledPoint, RFormula}
@@ -29,6 +29,7 @@ import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils}
 import org.apache.spark.ml.util.TestingUtils._
 import org.apache.spark.mllib.random._
 import org.apache.spark.mllib.util.MLlibTestSparkContext
+import org.apache.spark.serializer.KryoSerializer
 import org.apache.spark.sql.{DataFrame, Row}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types.FloatType
@@ -1687,6 +1688,14 @@ class GeneralizedLinearRegressionSuite extends MLTest with DefaultReadWriteTest
     assert(evalSummary.deviance === summary.deviance)
     assert(evalSummary.aic === summary.aic)
   }
+
+  test("SPARK-23131 Kryo raises StackOverflow during serializing GLR model") {
+    // Passes as long as Kryo serializes the fitted model without throwing.
+    val kryo = new KryoSerializer(new SparkConf(loadDefaults = false)).newInstance()
+    val training = Seq(Instance(1.0, 1.0, Vectors.dense(1.0, 7.0))).toDF
+    val fitted = new GeneralizedLinearRegression().fit(training)
+    kryo.serialize[GeneralizedLinearRegressionModel](fitted)
+  }
 }
 
 object GeneralizedLinearRegressionSuite {

diff --git a/pom.xml b/pom.xml
@@ -136,7 +136,7 @@
     <hive.parquet.version>1.6.0</hive.parquet.version>
     <jetty.version>9.3.24.v20180605</jetty.version>
     <javaxservlet.version>3.1.0</javaxservlet.version>
-    <chill.version>0.8.4</chill.version>
+    <chill.version>0.9.3</chill.version>
     <ivy.version>2.4.0</ivy.version>
     <oro.version>2.0.8</oro.version>
     <codahale.metrics.version>3.1.5</codahale.metrics.version>
@@ -1770,6 +1770,10 @@
             <groupId>org.apache.hive</groupId>
             <artifactId>hive-storage-api</artifactId>
           </exclusion>
+          <exclusion>
+            <groupId>com.esotericsoftware</groupId>
+            <artifactId>kryo-shaded</artifactId>
+          </exclusion>
         </exclusions>
       </dependency>
       <dependency>