databricks · JorisTruong · Dec 21, 2022 · srowen · Dec 22, 2022 · srowen
diff --git a/src/main/scala/com/databricks/spark/xml/XmlOptions.scala b/src/main/scala/com/databricks/spark/xml/XmlOptions.scala
@@ -64,6 +64,7 @@ private[xml] class XmlOptions(
     parameters.getOrElse("wildcardColName", XmlOptions.DEFAULT_WILDCARD_COL_NAME)
   val ignoreNamespace = parameters.get("ignoreNamespace").map(_.toBoolean).getOrElse(false)
   val timestampFormat = parameters.get("timestampFormat")
+  val timeZone = parameters.getOrElse("timeZone", XmlOptions.DEFAULT_TIME_ZONE)
   val dateFormat = parameters.get("dateFormat")
 }
 
@@ -77,6 +78,7 @@ private[xml] object XmlOptions {
   val DEFAULT_CHARSET: String = StandardCharsets.UTF_8.name
   val DEFAULT_NULL_VALUE: String = null
   val DEFAULT_WILDCARD_COL_NAME = "xs_any"
+  val DEFAULT_TIME_ZONE = "UTC"
 
   def apply(parameters: Map[String, String]): XmlOptions = new XmlOptions(parameters)
 }
diff --git a/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala b/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala
@@ -115,11 +115,14 @@ private[xml] object TypeCast {
   )
 
   private def parseXmlTimestamp(value: String, options: XmlOptions): Timestamp = {
+    val timeZone = options.timeZone
     val formatters = options.timestampFormat.map(DateTimeFormatter.ofPattern).
       map(supportedXmlTimestampFormatters :+ _).getOrElse(supportedXmlTimestampFormatters)
     formatters.foreach { format =>
       try {
-        return Timestamp.from(ZonedDateTime.parse(value, format).toInstant)
+        return Timestamp.from(
+          ZonedDateTime.parse(value, format.withZone(ZoneId.of(timeZone))).toInstant
+        )
       } catch {
         case _: Exception => // continue
       }

diff --git a/src/test/resources/time.xml b/src/test/resources/time.xml
@@ -2,4 +2,5 @@
     <author>John Smith</author>
     <time>2011-12-03T10:15:30Z</time>
     <time2>12-03-2011 10:15:30 PST</time2>
+    <time3>2011/12/03 06:15:30</time3>
 </book>
diff --git a/src/test/scala/com/databricks/spark/xml/XmlSuite.scala b/src/test/scala/com/databricks/spark/xml/XmlSuite.scala
@@ -1357,7 +1357,12 @@ final class XmlSuite extends AnyFunSuite with BeforeAndAfterAll {
       .option("rowTag", "book")
       .xml(resDir + "time.xml")
     val expectedSchema =
-      buildSchema(field("author"), field("time", TimestampType), field("time2", StringType))
+      buildSchema(
+        field("author"),
+        field("time", TimestampType),
+        field("time2", StringType),
+        field("time3", StringType)
+      )
     assert(df.schema === expectedSchema)
     assert(df.collect().head.getAs[Timestamp](1).getTime === 1322907330000L)
   }
@@ -1379,11 +1384,32 @@ final class XmlSuite extends AnyFunSuite with BeforeAndAfterAll {
       .option("timestampFormat", "MM-dd-yyyy HH:mm:ss z")
       .xml(resDir + "time.xml")
     val expectedSchema =
-      buildSchema(field("author"), field("time", TimestampType), field("time2", TimestampType))
+      buildSchema(
+        field("author"),
+        field("time", TimestampType),
+        field("time2", TimestampType),
+        field("time3", StringType)
+      )
     assert(df.schema === expectedSchema)
     assert(df.collect().head.getAs[Timestamp](2).getTime === 1322936130000L)
   }
 
+  test("Test custom timestampFormat") { // custom pattern with no zone/offset field
+    val df = spark.read
+      .option("rowTag", "book")
+      .option("timestampFormat", "yyyy/MM/dd HH:mm:ss") // no "timeZone" option is set here
+      .xml(resDir + "time.xml")
+    val expectedSchema =
+      buildSchema(
+        field("author"),
+        field("time", TimestampType),
+        field("time2", StringType), // expected to stay a string under this pattern
+        field("time3", TimestampType) // the column the custom pattern targets
+      )
+    assert(df.schema === expectedSchema)
+    assert(df.collect().head.getAs[Timestamp](3).getTime === 1322892930000L) // 06:15:30 UTC
+  }
+
   test("Test null number type is null not 0.0") {
     val schema = buildSchema(
       struct("Header",