Skip to content

Commit

Permalink
Add arrayElementName option (#603)
Browse files Browse the repository at this point in the history
  • Loading branch information
srowen authored Aug 31, 2022
1 parent c529e1f commit e1f2832
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 4 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ When writing files the API accepts several options:
* `rowTag`: The row tag of your xml files to treat as a row. For example, in `<books> <book>...</book> <book>...</book> </books>`, the appropriate value would be `book`. Default is `ROW`.
* `rootTag`: The root tag of your xml files to treat as the root. For example, in `<books> <book>...</book> <book>...</book> </books>`, the appropriate value would be `books`. It can include basic attributes by specifying a value like `books foo="bar"` (as of 0.11.0). Default is `ROWS`.
* `declaration`: Content of XML declaration to write at the start of every output XML file, before the `rootTag`. For example, a value of `foo` causes `<?xml foo?>` to be written. Set to empty string to suppress. Defaults to `version="1.0" encoding="UTF-8" standalone="yes"`. New in 0.14.0.
* `arrayElementName`: Name of XML element that encloses each element of an array-valued column when writing. Default is `item`. New in 0.16.0.
* `nullValue`: The value to write in place of a `null` value. Default is the string `null`. When this is set to `null`, attributes and elements are not written for `null` fields.
* `attributePrefix`: The prefix for attributes, used to differentiate attributes from elements. This will be the prefix for field names. Default is `_`. Cannot be empty for writing XML.
* `valueTag`: The tag used for the character value of an element that has attributes but no child elements. Default is `_VALUE`.
Expand Down
3 changes: 3 additions & 0 deletions src/main/scala/com/databricks/spark/xml/XmlOptions.scala
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ private[xml] class XmlOptions(
val declaration = parameters.getOrElse("declaration", XmlOptions.DEFAULT_DECLARATION)
require(!declaration.startsWith("<") && !declaration.endsWith(">"),
"'declaration' should not include angle brackets")
val arrayElementName = parameters.getOrElse("arrayElementName",
XmlOptions.DEFAULT_ARRAY_ELEMENT_NAME)
val samplingRatio = parameters.get("samplingRatio").map(_.toDouble).getOrElse(1.0)
require(samplingRatio > 0, s"samplingRatio ($samplingRatio) should be greater than 0")
val excludeAttributeFlag = parameters.get("excludeAttribute").map(_.toBoolean).getOrElse(false)
Expand Down Expand Up @@ -71,6 +73,7 @@ private[xml] object XmlOptions {
val DEFAULT_ROW_TAG = "ROW"
val DEFAULT_ROOT_TAG = "ROWS"
val DEFAULT_DECLARATION = "version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\""
val DEFAULT_ARRAY_ELEMENT_NAME = "item"
val DEFAULT_CHARSET: String = StandardCharsets.UTF_8.name
val DEFAULT_NULL_VALUE: String = null
val DEFAULT_WILDCARD_COL_NAME = "xs_any"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,10 +104,10 @@ private[xml] object StaxXmlGenerator {
// [[ArrayType]] as element type. It always wraps the element with [[StructType]]. So,
// this case only can happen when we convert a normal [[DataFrame]] to XML file.
// When [[ArrayType]] has [[ArrayType]] as elements, it is confusing what is element name
// for XML file. Now, it is "item" but this might have to be according the parent field name.
// for XML file.
case (ArrayType(ty, _), v: scala.collection.Seq[_]) =>
v.foreach { e =>
writeChild("item", ty, e)
writeChild(options.arrayElementName, ty, e)
}

case (MapType(_, vt, _), mv: Map[_, _]) =>
Expand Down
11 changes: 9 additions & 2 deletions src/test/scala/com/databricks/spark/xml/XmlSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,13 @@ final class XmlSuite extends AnyFunSuite with BeforeAndAfterAll {
"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>")
}

// Writing an array-of-array column should wrap inner elements in the tag
// configured via the `arrayElementName` write option instead of the
// default "item".
test("DSL save with item") {
  val savePath = getEmptyTempDir().resolve("items-temp.xml")
  val df = spark.createDataFrame(Seq(Tuple1(Array(Array(3, 4)))))
    .toDF("thing")
    .repartition(1)
  df.write.option("arrayElementName", "foo").xml(savePath.toString)
  // Two inner values (3 and 4) -> two lines containing the custom wrapper tag.
  val written = getLines(savePath.resolve("part-00000"))
  assert(written.count(_.contains("<foo>")) === 2)
}

test("DSL save with nullValue and treatEmptyValuesAsNulls") {
val copyFilePath = getEmptyTempDir().resolve("books-copy.xml")

Expand Down Expand Up @@ -443,11 +450,11 @@ final class XmlSuite extends AnyFunSuite with BeforeAndAfterAll {
df.write.xml(copyFilePath.toString)

// When [[ArrayType]] has [[ArrayType]] as elements, it is confusing what is the element
// name for XML file. Now, it is "item". So, "item" field is additionally added
// name for XML file. Now, it is "item" by default. So, "item" field is additionally added
// to wrap the element.
val schemaCopy = buildSchema(
structArray("a",
field("item", ArrayType(StringType))))
field(XmlOptions.DEFAULT_ARRAY_ELEMENT_NAME, ArrayType(StringType))))
val dfCopy = spark.read.xml(copyFilePath.toString)

assert(dfCopy.count() === df.count())
Expand Down

0 comments on commit e1f2832

Please sign in to comment.