diff --git a/.github/workflows/test_spark_2_java_8.yml b/.github/workflows/test_spark_3_2_java_11.yml
similarity index 61%
rename from .github/workflows/test_spark_2_java_8.yml
rename to .github/workflows/test_spark_3_2_java_11.yml
index 0e1a51a8..fce7afa6 100644
--- a/.github/workflows/test_spark_2_java_8.yml
+++ b/.github/workflows/test_spark_3_2_java_11.yml
@@ -1,4 +1,4 @@
-name: Spark 2 / Java 8
+name: Spark 3.2 / Java 11 / Scala 2.13
 on:
   push:
     branches: [master]
@@ -12,6 +12,6 @@ jobs:
       - name: Set up Java, SBT
         uses: olafurpg/setup-scala@v11
         with:
-          java-version: 'adopt@1.8'
+          java-version: 'adopt@1.11'
       - name: Build and test
-        run: sbt -Dspark.testVersion=2.4.8 ++2.11.12 clean scalastyle test:scalastyle mimaReportBinaryIssues test
+        run: sbt -Dspark.testVersion=3.2.0 ++2.13.5 clean test
diff --git a/.github/workflows/test_spark_3_java_11.yml b/.github/workflows/test_spark_3_java_8.yml
similarity index 67%
rename from .github/workflows/test_spark_3_java_11.yml
rename to .github/workflows/test_spark_3_java_8.yml
index 45637c3c..8c25142f 100644
--- a/.github/workflows/test_spark_3_java_11.yml
+++ b/.github/workflows/test_spark_3_java_8.yml
@@ -1,4 +1,4 @@
-name: Spark 3 / Java 11
+name: Spark 3 / Java 8 / Scala 2.12
 on:
   push:
     branches: [master]
@@ -12,8 +12,8 @@ jobs:
      - name: Set up Java, SBT
        uses: olafurpg/setup-scala@v11
        with:
-          java-version: 'adopt@1.11'
+          java-version: 'adopt@1.8'
      - name: Build and test
-        run: sbt -Dspark.testVersion=3.1.2 ++2.12.10 clean scalastyle test:scalastyle mimaReportBinaryIssues coverage test coverageReport
+        run: sbt -Dspark.testVersion=3.0.3 ++2.12.10 clean scalastyle test:scalastyle mimaReportBinaryIssues coverage test coverageReport
      - name: Check code coverage
-        run: bash <(curl -s https://codecov.io/bash)
+        run: bash <(curl -s https://codecov.io/bash)
\ No newline at end of file
diff --git a/README.md b/README.md
index 53fe86d7..21cf9bbd 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@ The structure and test tools are mostly copied from [CSV Data Source for Spark](
 
 - This package supports to process format-free XML files in a distributed way, unlike JSON datasource in
 Spark restricts in-line JSON format.
-- Compatible with Spark 2.4.x and 3.x, with Scala 2.12. Scala 2.11 support with Spark 2.4.x is deprecated.
+- Compatible with Spark 3.0 and later with Scala 2.12, and also Spark 3.2 and later with Scala 2.12 or 2.13. Scala 2.11 and Spark 2 support ended with version 0.13.0.
 
 ## Linking
 
@@ -16,7 +16,7 @@ You can link against this library in your program at the following coordinates:
 ```
 groupId: com.databricks
 artifactId: spark-xml_2.12
-version: 0.13.0
+version: 0.14.0
 ```
 
 ## Using with Spark shell
@@ -24,7 +24,7 @@ version: 0.13.0
 This package can be added to Spark using the `--packages` command line option.
 For example, to include it when starting the spark shell:
 ```
-$SPARK_HOME/bin/spark-shell --packages com.databricks:spark-xml_2.12:0.13.0
+$SPARK_HOME/bin/spark-shell --packages com.databricks:spark-xml_2.12:0.14.0
 ```
 
 ## Features
@@ -399,7 +399,7 @@ Automatically infer schema (data types)
 ```R
 library(SparkR)
 
-sparkR.session("local[4]", sparkPackages = c("com.databricks:spark-xml_2.12:0.13.0"))
+sparkR.session("local[4]", sparkPackages = c("com.databricks:spark-xml_2.12:0.14.0"))
 
 df <- read.df("books.xml", source = "xml", rowTag = "book")
 
@@ -411,7 +411,7 @@ You can manually specify schema:
 ```R
 library(SparkR)
 
-sparkR.session("local[4]", sparkPackages = c("com.databricks:spark-xml_2.12:0.13.0"))
+sparkR.session("local[4]", sparkPackages = c("com.databricks:spark-xml_2.12:0.14.0"))
 customSchema <- structType(
   structField("_id", "string"),
   structField("author", "string"),
diff --git a/build.sbt b/build.sbt
index 7c415993..e39b6a8e 100755
--- a/build.sbt
+++ b/build.sbt
@@ -1,16 +1,16 @@
 name := "spark-xml"
 
-version := "0.13.0"
+version := "0.14.0"
 
 organization := "com.databricks"
 
 scalaVersion := "2.12.10"
 
-crossScalaVersions := Seq("2.11.12", "2.12.10")
+crossScalaVersions := Seq("2.12.10", "2.13.5")
 
 scalacOptions := Seq("-unchecked", "-deprecation")
 
-val sparkVersion = sys.props.get("spark.testVersion").getOrElse("2.4.8")
+val sparkVersion = sys.props.get("spark.testVersion").getOrElse("3.2.0")
 
 // To avoid packaging it, it's Provided below
 autoScalaLibrary := false
diff --git a/src/main/scala/com/databricks/spark/xml/XmlDataToCatalyst.scala b/src/main/scala/com/databricks/spark/xml/XmlDataToCatalyst.scala
index 6927d766..cc5cd796 100644
--- a/src/main/scala/com/databricks/spark/xml/XmlDataToCatalyst.scala
+++ b/src/main/scala/com/databricks/spark/xml/XmlDataToCatalyst.scala
@@ -57,4 +57,7 @@ case class XmlDataToCatalyst(
     case _: StructType => Seq(StringType)
     case ArrayType(_: StructType, _) => Seq(ArrayType(StringType))
   }
+
+  // Overrides, in Spark 3.2.0+
+  protected def withNewChildInternal(newChild: Expression): XmlDataToCatalyst = copy(newChild)
 }
diff --git a/src/main/scala/com/databricks/spark/xml/parsers/StaxXmlGenerator.scala b/src/main/scala/com/databricks/spark/xml/parsers/StaxXmlGenerator.scala
index 49947cac..5629b2f3 100644
--- a/src/main/scala/com/databricks/spark/xml/parsers/StaxXmlGenerator.scala
+++ b/src/main/scala/com/databricks/spark/xml/parsers/StaxXmlGenerator.scala
@@ -66,7 +66,7 @@ private[xml] object StaxXmlGenerator {
         writer.writeAttribute(name.substring(options.attributePrefix.length), v.toString)
 
       // For ArrayType, we just need to write each as XML element.
-      case (ArrayType(ty, _), v: Seq[_]) =>
+      case (ArrayType(ty, _), v: scala.collection.Seq[_]) =>
        v.foreach { e =>
          writeChildElement(name, ty, e)
        }
@@ -101,7 +101,7 @@ private[xml] object StaxXmlGenerator {
      // this case only can happen when we convert a normal [[DataFrame]] to XML file.
      // When [[ArrayType]] has [[ArrayType]] as elements, it is confusing what is element name
      // for XML file. Now, it is "item" but this might have to be according the parent field name.
-      case (ArrayType(ty, _), v: Seq[_]) =>
+      case (ArrayType(ty, _), v: scala.collection.Seq[_]) =>
        v.foreach { e =>
          writeChild("item", ty, e)
        }
diff --git a/src/test/scala/com/databricks/spark/xml/XmlSuite.scala b/src/test/scala/com/databricks/spark/xml/XmlSuite.scala
index df90e6a6..c556512a 100755
--- a/src/test/scala/com/databricks/spark/xml/XmlSuite.scala
+++ b/src/test/scala/com/databricks/spark/xml/XmlSuite.scala
@@ -890,17 +890,17 @@ final class XmlSuite extends AnyFunSuite with BeforeAndAfterAll {
     val resultsOne = spark.read
       .option("treatEmptyValuesAsNulls", "true")
       .xml(resDir + "gps-empty-field.xml")
-    assert(resultsOne.selectExpr("extensions.TrackPointExtension").head.getStruct(0) !== null)
+    assert(resultsOne.selectExpr("extensions.TrackPointExtension").head().getStruct(0) !== null)
     assert(resultsOne.selectExpr("extensions.TrackPointExtension")
-      .head.getStruct(0)(0) === null)
+      .head().getStruct(0)(0) === null)
     // Is the behavior below consistent? see line above.
-    assert(resultsOne.selectExpr("extensions.TrackPointExtension.hr").head.getStruct(0) === null)
+    assert(resultsOne.selectExpr("extensions.TrackPointExtension.hr").head().getStruct(0) === null)
     assert(resultsOne.collect().length === 2)
 
     val resultsTwo = spark.read
       .option("nullValue", "2013-01-24T06:18:43Z")
       .xml(resDir + "gps-empty-field.xml")
-    assert(resultsTwo.selectExpr("time").head.getStruct(0) === null)
+    assert(resultsTwo.selectExpr("time").head().getStruct(0) === null)
     assert(resultsTwo.collect().length === 2)
   }
 
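
A note on the `XmlDataToCatalyst` change above: Spark 3.2 reworked Catalyst's tree-transformation API so that every unary expression is expected to provide `withNewChildInternal`, which the optimizer uses to rebuild a node around a replacement child. The new method is declared without the `override` keyword, presumably so the same source still compiles against Spark 3.0/3.1, where no such member exists. A minimal sketch of the same pattern on a hypothetical expression (`UpperCaseExpr` is illustrative only, not part of spark-xml):

```scala
import org.apache.spark.sql.catalyst.expressions.{Expression, UnaryExpression}
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.types.{DataType, StringType}
import org.apache.spark.unsafe.types.UTF8String

// Hypothetical unary expression, used only to illustrate the Spark 3.2+ hook.
case class UpperCaseExpr(child: Expression) extends UnaryExpression with CodegenFallback {

  override def dataType: DataType = StringType

  // Upper-cases the (non-null) child value; nulls are handled by UnaryExpression.eval.
  override protected def nullSafeEval(input: Any): Any =
    UTF8String.fromString(input.toString.toUpperCase)

  // Abstract in Spark 3.2+; declared without `override` so the same source also
  // compiles against Spark 3.0/3.1, where the method simply goes unused.
  protected def withNewChildInternal(newChild: Expression): UpperCaseExpr =
    copy(child = newChild)
}
```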
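
On the `StaxXmlGenerator` change: in Scala 2.13, `scala.Seq` became an alias for `scala.collection.immutable.Seq`, while the array values Spark hands to the writer at runtime are typically array-backed `scala.collection.Seq` instances (for example `mutable.ArraySeq`), so a match on plain `Seq[_]` that works on 2.12 can fall through on 2.13. A small self-contained sketch of the difference (names here are illustrative, not from the patch):

```scala
import scala.collection.mutable

// Demonstrates why the generator now matches on scala.collection.Seq[_].
object SeqMatchSketch {

  def describe(v: Any): String = v match {
    case _: Seq[_] => "scala.Seq (immutable.Seq on 2.13)"
    case _: scala.collection.Seq[_] => "scala.collection.Seq only"
    case _ => "no match"
  }

  def main(args: Array[String]): Unit = {
    // Array-backed sequence, similar to the wrappers Spark uses for array-typed Row fields.
    val arrayBacked: mutable.ArraySeq[Int] = mutable.ArraySeq(1, 2, 3)
    // Prints the first branch on Scala 2.12 but the second on Scala 2.13,
    // because mutable.ArraySeq is not an immutable.Seq there.
    println(describe(arrayBacked))
    println(describe(List(1, 2, 3))) // immutable: first branch on both versions
  }
}
```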
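
On the `XmlSuite` change from `head` to `head()`: `Dataset.head()` is declared with an empty parameter list, and Scala 2.13 deprecates calling such methods without parentheses ("auto-application"), so the explicit parentheses presumably keep the cross-built test sources warning-free. A tiny illustration with a stand-in class (not Spark's API):

```scala
// Stand-in for Dataset, only to show the auto-application rule.
class Rows {
  def head(): String = "first row" // declared with (), like Dataset.head()
}

object AutoApplicationSketch {
  def main(args: Array[String]): Unit = {
    val rows = new Rows
    // rows.head        // compiles on 2.12; deprecated "auto-application" on 2.13
    println(rows.head()) // explicit parentheses compile cleanly on both
  }
}
```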