Skip to content

Commit

Permalink
Add arrayElementName option (#603)
Browse files Browse the repository at this point in the history
  • Loading branch information
srowen authored Aug 31, 2022
1 parent c529e1f commit e1f2832
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 4 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ When writing files the API accepts several options:
* `rowTag`: The row tag of your xml files to treat as a row. For example, in `<books> <book>...</book> <book>...</book> </books>`, the appropriate value would be `book`. Default is `ROW`.
* `rootTag`: The root tag of your xml files to treat as the root. For example, in `<books> <book>...</book> <book>...</book> </books>`, the appropriate value would be `books`. It can include basic attributes by specifying a value like `books foo="bar"` (as of 0.11.0). Default is `ROWS`.
* `declaration`: Content of XML declaration to write at the start of every output XML file, before the `rootTag`. For example, a value of `foo` causes `<?xml foo?>` to be written. Set to empty string to suppress. Defaults to `version="1.0" encoding="UTF-8" standalone="yes"`. New in 0.14.0.
* `arrayElementName`: Name of XML element that encloses each element of an array-valued column when writing. Default is `item`. New in 0.16.0.
* `nullValue`: The value to write in place of a `null` value. Default is the string `null`. When this is set to `null`, attributes and elements are not written for `null` fields.
* `attributePrefix`: The prefix for attributes, used to differentiate attributes from elements. This will be the prefix for field names. Default is `_`. Cannot be empty for writing XML.
* `valueTag`: The tag used for the character value of an element that has attributes but no child elements. Default is `_VALUE`.
Expand Down
3 changes: 3 additions & 0 deletions src/main/scala/com/databricks/spark/xml/XmlOptions.scala
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ private[xml] class XmlOptions(
val declaration = parameters.getOrElse("declaration", XmlOptions.DEFAULT_DECLARATION)
require(!declaration.startsWith("<") && !declaration.endsWith(">"),
"'declaration' should not include angle brackets")
val arrayElementName = parameters.getOrElse("arrayElementName",
XmlOptions.DEFAULT_ARRAY_ELEMENT_NAME)
val samplingRatio = parameters.get("samplingRatio").map(_.toDouble).getOrElse(1.0)
require(samplingRatio > 0, s"samplingRatio ($samplingRatio) should be greater than 0")
val excludeAttributeFlag = parameters.get("excludeAttribute").map(_.toBoolean).getOrElse(false)
Expand Down Expand Up @@ -71,6 +73,7 @@ private[xml] object XmlOptions {
val DEFAULT_ROW_TAG = "ROW"
val DEFAULT_ROOT_TAG = "ROWS"
val DEFAULT_DECLARATION = "version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\""
val DEFAULT_ARRAY_ELEMENT_NAME = "item"
val DEFAULT_CHARSET: String = StandardCharsets.UTF_8.name
val DEFAULT_NULL_VALUE: String = null
val DEFAULT_WILDCARD_COL_NAME = "xs_any"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,10 +104,10 @@ private[xml] object StaxXmlGenerator {
// [[ArrayType]] as element type. It always wraps the element with [[StructType]]. So,
// this case only can happen when we convert a normal [[DataFrame]] to XML file.
// When [[ArrayType]] has [[ArrayType]] as elements, it is confusing what is element name
// for XML file. Now, it is "item" but this might have to be according the parent field name.
// for XML file.
case (ArrayType(ty, _), v: scala.collection.Seq[_]) =>
v.foreach { e =>
writeChild("item", ty, e)
writeChild(options.arrayElementName, ty, e)
}

case (MapType(_, vt, _), mv: Map[_, _]) =>
Expand Down
11 changes: 9 additions & 2 deletions src/test/scala/com/databricks/spark/xml/XmlSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,13 @@ final class XmlSuite extends AnyFunSuite with BeforeAndAfterAll {
"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>")
}

// Writing an array-of-array column should wrap inner elements in the tag
// configured via the `arrayElementName` write option instead of the
// default "item".
test("DSL save with item") {
  val savePath = getEmptyTempDir().resolve("items-temp.xml")
  val df = spark.createDataFrame(Seq(Tuple1(Array(Array(3, 4)))))
    .toDF("thing")
    .repartition(1)
  df.write.option("arrayElementName", "foo").xml(savePath.toString)
  // Two inner values (3 and 4) -> two lines containing the custom wrapper tag.
  val written = getLines(savePath.resolve("part-00000"))
  assert(written.count(_.contains("<foo>")) === 2)
}

test("DSL save with nullValue and treatEmptyValuesAsNulls") {
val copyFilePath = getEmptyTempDir().resolve("books-copy.xml")

Expand Down Expand Up @@ -443,11 +450,11 @@ final class XmlSuite extends AnyFunSuite with BeforeAndAfterAll {
df.write.xml(copyFilePath.toString)

// When [[ArrayType]] has [[ArrayType]] as elements, it is confusing what is the element
// name for XML file. Now, it is "item". So, "item" field is additionally added
// name for XML file. Now, it is "item" by default. So, "item" field is additionally added
// to wrap the element.
val schemaCopy = buildSchema(
structArray("a",
field("item", ArrayType(StringType))))
field(XmlOptions.DEFAULT_ARRAY_ELEMENT_NAME, ArrayType(StringType))))
val dfCopy = spark.read.xml(copyFilePath.toString)

assert(dfCopy.count() === df.count())
Expand Down

0 comments on commit e1f2832

Please sign in to comment.