From b672d80513cfc5d68151fefb873bd9888f79260b Mon Sep 17 00:00:00 2001 From: Joris Truong Date: Mon, 26 Dec 2022 10:12:01 +0100 Subject: [PATCH 01/16] feat: allow custom timestamp with spark timezone --- README.md | 4 +-- .../databricks/spark/xml/DefaultSource.scala | 3 +- .../com/databricks/spark/xml/XmlOptions.scala | 1 + .../databricks/spark/xml/util/TypeCast.scala | 10 ++++++- src/test/resources/time.xml | 1 + .../com/databricks/spark/xml/XmlSuite.scala | 30 +++++++++++++++++-- 6 files changed, 43 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 54cd51a2..d66650f4 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ Defaults to `false`. New in 0.11.0. * `timestampFormat`: Specifies an additional timestamp format that will be tried when parsing values as `TimestampType` columns. The format is specified as described in [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html). Defaults to trying several formats, including [ISO_INSTANT](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_INSTANT), -including variations with offset timezones or no timezone (defaults to UTC). New in 0.12.0. +including variations with offset timezones or no timezone (defaults to UTC). If a custom format is used, it will check if it has a timezone in the pattern. If not, it will use the default Spark timezone from `spark.sql.session.timeZone`. New in 0.12.0. * `dateFormat`: Specifies an additional date format that will be tried when parsing values as `DateType` columns. The format is specified as described in [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html). Defaults to [ISO_DATE](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_DATE). New in 0.12.0. @@ -83,7 +83,7 @@ When writing files the API accepts several options: * `compression`: compression codec to use when saving to file. Should be the fully qualified name of a class implementing `org.apache.hadoop.io.compress.CompressionCodec` or one of case-insensitive shortened names (`bzip2`, `gzip`, `lz4`, and `snappy`). Defaults to no compression when a codec is not specified. * `timestampFormat`: Controls the format used to write `TimestampType` format columns. The format is specified as described in [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html). -Defaults to [ISO_INSTANT](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_INSTANT). New in 0.12.0. +Defaults to [ISO_INSTANT](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_INSTANT). If a custom format is used, it will check if it has a timezone in the pattern. If not, it will use the default Spark timezone from `spark.sql.session.timeZone`. New in 0.12.0. * `dateFormat`: Controls the format used to write `DateType` format columns. The format is specified as described in [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html). Defaults to [ISO_DATE](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_DATE). New in 0.12.0.
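The behaviour the README describes can be sketched in isolation. The snippet below is a minimal standalone approximation, not the plugin's actual code path: `parseWithSessionTz` is a hypothetical helper, and the zone-letter scan stands in for the pattern check (later patches in this series detect zone-aware patterns by scanning for the letters `V`, `z`, `O`, `X`, `x` and `Z`).

```scala
import java.sql.Timestamp
import java.time.{Instant, ZoneId}
import java.time.format.DateTimeFormatter

// Hypothetical helper mirroring the documented fallback: when the custom
// pattern carries no zone/offset letters, resolve the wall-clock value in
// the Spark session timezone instead of leaving it unresolvable.
def parseWithSessionTz(value: String, pattern: String, sessionTz: String): Timestamp = {
  val zoneLetters = Seq("V", "z", "O", "X", "x", "Z")
  val base = DateTimeFormatter.ofPattern(pattern)
  val formatter =
    if (zoneLetters.exists(l => pattern.contains(l))) base // pattern is zone-aware
    else base.withZone(ZoneId.of(sessionTz))               // fall back to the session zone
  Timestamp.from(Instant.from(formatter.parse(value)))
}

// "2011/12/03 06:15:30" read with sessionTz = "UTC" -> 2011-12-03T06:15:30Z;
// with "Asia/Shanghai" the same string lands eight hours earlier on the UTC timeline.
parseWithSessionTz("2011/12/03 06:15:30", "yyyy/MM/dd HH:mm:ss", "UTC")
```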
diff --git a/src/main/scala/com/databricks/spark/xml/DefaultSource.scala b/src/main/scala/com/databricks/spark/xml/DefaultSource.scala index 198bf208..b5a45257 100755 --- a/src/main/scala/com/databricks/spark/xml/DefaultSource.scala +++ b/src/main/scala/com/databricks/spark/xml/DefaultSource.scala @@ -70,7 +70,8 @@ class DefaultSource XmlRelation( () => XmlFile.withCharset(sqlContext.sparkContext, path, charset, rowTag), Some(path), - parameters, + parameters + ("timezone" -> + sqlContext.sparkContext.getConf.get("spark.sql.session.timeZone")), schema)(sqlContext) } diff --git a/src/main/scala/com/databricks/spark/xml/XmlOptions.scala b/src/main/scala/com/databricks/spark/xml/XmlOptions.scala index 21994fcb..299a999e 100644 --- a/src/main/scala/com/databricks/spark/xml/XmlOptions.scala +++ b/src/main/scala/com/databricks/spark/xml/XmlOptions.scala @@ -64,6 +64,7 @@ private[xml] class XmlOptions( parameters.getOrElse("wildcardColName", XmlOptions.DEFAULT_WILDCARD_COL_NAME) val ignoreNamespace = parameters.get("ignoreNamespace").map(_.toBoolean).getOrElse(false) val timestampFormat = parameters.get("timestampFormat") + val timezone = parameters.get("timezone") val dateFormat = parameters.get("dateFormat") } diff --git a/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala b/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala index 49882f9c..1f110166 100644 --- a/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala +++ b/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala @@ -119,7 +119,15 @@ private[xml] object TypeCast { map(supportedXmlTimestampFormatters :+ _).getOrElse(supportedXmlTimestampFormatters) formatters.foreach { format => try { - return Timestamp.from(ZonedDateTime.parse(value, format).toInstant) + // If format is not in supported and no timezone in format, use default Spark timezone + if (!supportedXmlTimestampFormatters.contains(format) && Option(format.getZone).isEmpty) { + return Timestamp.from( + ZonedDateTime.parse(value, format.withZone(ZoneId.of(options.timezone.get))).toInstant + ) + } + return Timestamp.from( + ZonedDateTime.parse(value, format).toInstant + ) } catch { case _: Exception => // continue } diff --git a/src/test/resources/time.xml b/src/test/resources/time.xml index ee0609c7..2e0d9de0 100644 --- a/src/test/resources/time.xml +++ b/src/test/resources/time.xml @@ -2,4 +2,5 @@ John Smith 12-03-2011 10:15:30 PST + 2011/12/03 06:15:30 \ No newline at end of file diff --git a/src/test/scala/com/databricks/spark/xml/XmlSuite.scala b/src/test/scala/com/databricks/spark/xml/XmlSuite.scala index e5d34e71..642ded06 100755 --- a/src/test/scala/com/databricks/spark/xml/XmlSuite.scala +++ b/src/test/scala/com/databricks/spark/xml/XmlSuite.scala @@ -1357,7 +1357,12 @@ final class XmlSuite extends AnyFunSuite with BeforeAndAfterAll { .option("rowTag", "book") .xml(resDir + "time.xml") val expectedSchema = - buildSchema(field("author"), field("time", TimestampType), field("time2", StringType)) + buildSchema( + field("author"), + field("time", TimestampType), + field("time2", StringType), + field("time3", StringType) + ) assert(df.schema === expectedSchema) assert(df.collect().head.getAs[Timestamp](1).getTime === 1322907330000L) } @@ -1379,11 +1384,32 @@ final class XmlSuite extends AnyFunSuite with BeforeAndAfterAll { .option("timestampFormat", "MM-dd-yyyy HH:mm:ss z") .xml(resDir + "time.xml") val expectedSchema = - buildSchema(field("author"), field("time", TimestampType), field("time2", TimestampType)) + buildSchema( + field("author"), + 
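// time3 is the newly added column: with no custom timestampFormat, "2011/12/03 06:15:30" matches none of the built-in formatters and is inferred as a plain string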
field("time", TimestampType), + field("time2", TimestampType), + field("time3", StringType) + ) assert(df.schema === expectedSchema) assert(df.collect().head.getAs[Timestamp](2).getTime === 1322936130000L) } + test("Test custom timestampFormat") { + val df = spark.read + .option("rowTag", "book") + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss") + .xml(resDir + "time.xml") + val expectedSchema = + buildSchema( + field("author"), + field("time", TimestampType), + field("time2", StringType), + field("time3", TimestampType) + ) + assert(df.schema === expectedSchema) + assert(df.collect().head.getAs[Timestamp](3).getTime === 1322892930000L) + } + test("Test null number type is null not 0.0") { val schema = buildSchema( struct("Header", From 351b2aa4853d52db585020d87b50f742cf798b11 Mon Sep 17 00:00:00 2001 From: joristruong Date: Thu, 29 Dec 2022 11:57:29 +0800 Subject: [PATCH 02/16] docs: updated README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d66650f4..49e5e6d5 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ Defaults to `false`. New in 0.11.0. * `timestampFormat`: Specifies an additional timestamp format that will be tried when parsing values as `TimestampType` columns. The format is specified as described in [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html). Defaults to trying several formats, including [ISO_INSTANT](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_INSTANT), -including variations with offset timezones or no timezone (defaults to UTC). If a custom format is used, it will check if it has a timezone in the pattern. If not, it will use the default Spark timezone from `spark.sql.session.timeZone`. New in 0.12.0. +including variations with offset timezones or no timezone (defaults to UTC). New in 0.12.0. As of 0.16.0, if a custom format pattern is used without a timezone, the default Spark timezone specified by `spark.sql.session.timeZone` will be used. * `dateFormat`: Specifies an additional date format that will be tried when parsing values as `DateType` columns. The format is specified as described in [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html). Defaults to [ISO_DATE](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_DATE). New in 0.12.0. @@ -83,7 +83,7 @@ When writing files the API accepts several options: * `compression`: compression codec to use when saving to file. Should be the fully qualified name of a class implementing `org.apache.hadoop.io.compress.CompressionCodec` or one of case-insensitive shortened names (`bzip2`, `gzip`, `lz4`, and `snappy`). Defaults to no compression when a codec is not specified. * `timestampFormat`: Controls the format used to write `TimestampType` format columns. The format is specified as described in [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html). -Defaults to [ISO_INSTANT](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_INSTANT). If a custom format is used, it will check if it has a timezone in the pattern. If not, it will use the default Spark timezone from `spark.sql.session.timeZone`. New in 0.12.0. +Defaults to [ISO_INSTANT](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_INSTANT). New in 0.12.0.
As of 0.16.0, if a custom format pattern is used without a timezone, the default Spark timezone specified by `spark.sql.session.timeZone` will be used. * `dateFormat`: Controls the format used to write `DateType` format columns. The format is specified as described in [DateTimeFormatter](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html). Defaults to [ISO_DATE](https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html#ISO_DATE). New in 0.12.0. From aa65f282f1e27f0a377a0c306da32eb2466ffb3f Mon Sep 17 00:00:00 2001 From: joristruong Date: Thu, 29 Dec 2022 16:54:43 +0800 Subject: [PATCH 03/16] fix: ability to run without setting spark.sql.session.timeZone --- .../databricks/spark/xml/DefaultSource.scala | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/main/scala/com/databricks/spark/xml/DefaultSource.scala b/src/main/scala/com/databricks/spark/xml/DefaultSource.scala index b5a45257..04772d0e 100755 --- a/src/main/scala/com/databricks/spark/xml/DefaultSource.scala +++ b/src/main/scala/com/databricks/spark/xml/DefaultSource.scala @@ -67,12 +67,23 @@ class DefaultSource (options.charset, options.rowTag) } - XmlRelation( - () => XmlFile.withCharset(sqlContext.sparkContext, path, charset, rowTag), - Some(path), - parameters + ("timezone" -> - sqlContext.sparkContext.getConf.get("spark.sql.session.timeZone")), - schema)(sqlContext) + val sparkTimezone = sqlContext.sparkContext.getConf.getOption( + "spark.sql.session.timeZone" + ) + + if (sparkTimezone.isDefined) { + XmlRelation( + () => XmlFile.withCharset(sqlContext.sparkContext, path, charset, rowTag), + Some(path), + parameters + ("timezone" -> sparkTimezone.get), + schema)(sqlContext) + } else { + XmlRelation( + () => XmlFile.withCharset(sqlContext.sparkContext, path, charset, rowTag), + Some(path), + parameters, + schema)(sqlContext) + } } override def createRelation( From 53279a7830fe9f690631c4e851ffe490cb2dced5 Mon Sep 17 00:00:00 2001 From: joristruong Date: Thu, 29 Dec 2022 16:55:33 +0800 Subject: [PATCH 04/16] feat: break parseXmlTimestamp method into built-in formats and custom format processing --- .../databricks/spark/xml/util/TypeCast.scala | 29 ++++++++++++++----- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala b/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala index 1f110166..7f248e0f 100644 --- a/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala +++ b/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala @@ -115,20 +115,33 @@ private[xml] object TypeCast { ) private def parseXmlTimestamp(value: String, options: XmlOptions): Timestamp = { - val formatters = options.timestampFormat.map(DateTimeFormatter.ofPattern). 
- map(supportedXmlTimestampFormatters :+ _).getOrElse(supportedXmlTimestampFormatters) - formatters.foreach { format => + // Loop over built-in formats + supportedXmlTimestampFormatters.foreach { format => + try { + return Timestamp.from( + ZonedDateTime.parse(value, format).toInstant + ) + } catch { + case _: Exception => // continue + } + } + // Custom format + if (options.timestampFormat.isDefined) { try { - // If format is not in supported and no timezone in format, use default Spark timezone - if (!supportedXmlTimestampFormatters.contains(format) && Option(format.getZone).isEmpty) { + val format = DateTimeFormatter.ofPattern(options.timestampFormat.get) + // Custom format with timezone + if (Option(format.getZone).isDefined) { + return Timestamp.from( + ZonedDateTime.parse(value, format).toInstant + ) + } else { + // Custom format without timezone return Timestamp.from( ZonedDateTime.parse(value, format.withZone(ZoneId.of(options.timezone.get))).toInstant ) } - return Timestamp.from( - ZonedDateTime.parse(value, format).toInstant - ) } catch { + case _: NoSuchElementException => throw new NoSuchElementException("test") case _: Exception => // continue } } From b5435adbb93dfe45ad8dd3239e200a2ef94793d5 Mon Sep 17 00:00:00 2001 From: joristruong Date: Thu, 29 Dec 2022 17:03:49 +0800 Subject: [PATCH 05/16] fix: removed unused code --- src/main/scala/com/databricks/spark/xml/util/TypeCast.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala b/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala index 7f248e0f..e6d56061 100644 --- a/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala +++ b/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala @@ -141,7 +141,6 @@ private[xml] object TypeCast { ) } } catch { - case _: NoSuchElementException => throw new NoSuchElementException("test") case _: Exception => // continue } } From 8f483686b50f94a06bc3a26416955932cc4fadc4 Mon Sep 17 00:00:00 2001 From: joristruong Date: Fri, 30 Dec 2022 01:09:45 +0800 Subject: [PATCH 06/16] fix: timestampFormat with offset should not use spark.sql.session.timezone --- .../databricks/spark/xml/util/TypeCast.scala | 25 ++++++++------- src/test/resources/time.xml | 1 + .../com/databricks/spark/xml/XmlSuite.scala | 31 ++++++++++++++++--- 3 files changed, 41 insertions(+), 16 deletions(-) diff --git a/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala b/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala index e6d56061..e817737b 100644 --- a/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala +++ b/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala @@ -127,19 +127,20 @@ private[xml] object TypeCast { } // Custom format if (options.timestampFormat.isDefined) { + val format = DateTimeFormatter.ofPattern(options.timestampFormat.get) try { - val format = DateTimeFormatter.ofPattern(options.timestampFormat.get) - // Custom format with timezone - if (Option(format.getZone).isDefined) { - return Timestamp.from( - ZonedDateTime.parse(value, format).toInstant - ) - } else { - // Custom format without timezone - return Timestamp.from( - ZonedDateTime.parse(value, format.withZone(ZoneId.of(options.timezone.get))).toInstant - ) - } + // Custom format with timezone or offset + return Timestamp.from( + ZonedDateTime.parse(value, format).toInstant + ) + } catch { + case _: Exception => // continue + } + try { + // Custom format without timezone or offset + return Timestamp.from( + ZonedDateTime.parse(value, 
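+          // options.timezone.get assumes a session timezone was propagated; when it was not, the resulting NoSuchElementException is swallowed by the catch below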
format.withZone(ZoneId.of(options.timezone.get))).toInstant + ) } catch { case _: Exception => // continue } diff --git a/src/test/resources/time.xml b/src/test/resources/time.xml index 2e0d9de0..0374d3e4 100644 --- a/src/test/resources/time.xml +++ b/src/test/resources/time.xml @@ -3,4 +3,5 @@ 12-03-2011 10:15:30 PST 2011/12/03 06:15:30 + 2011/12/03 16:15:30 +1000 \ No newline at end of file diff --git a/src/test/scala/com/databricks/spark/xml/XmlSuite.scala b/src/test/scala/com/databricks/spark/xml/XmlSuite.scala index 642ded06..1130c80e 100755 --- a/src/test/scala/com/databricks/spark/xml/XmlSuite.scala +++ b/src/test/scala/com/databricks/spark/xml/XmlSuite.scala @@ -1361,7 +1361,8 @@ final class XmlSuite extends AnyFunSuite with BeforeAndAfterAll { field("author"), field("time", TimestampType), field("time2", StringType), - field("time3", StringType) + field("time3", StringType), + field("time4", StringType) ) assert(df.schema === expectedSchema) assert(df.collect().head.getAs[Timestamp](1).getTime === 1322907330000L) @@ -1388,13 +1389,15 @@ final class XmlSuite extends AnyFunSuite with BeforeAndAfterAll { field("author"), field("time", TimestampType), field("time2", TimestampType), - field("time3", StringType) + field("time3", StringType), + field("time4", StringType), ) assert(df.schema === expectedSchema) + assert(df.collect().head.getAs[Timestamp](1).getTime === 1322907330000L) assert(df.collect().head.getAs[Timestamp](2).getTime === 1322936130000L) } - test("Test custom timestampFormat") { + test("Test custom timestampFormat without timezone") { val df = spark.read .option("rowTag", "book") .option("timestampFormat", "yyyy/MM/dd HH:mm:ss") @@ -1404,12 +1407,32 @@ final class XmlSuite extends AnyFunSuite with BeforeAndAfterAll { field("author"), field("time", TimestampType), field("time2", StringType), - field("time3", TimestampType) + field("time3", TimestampType), + field("time4", StringType) ) assert(df.schema === expectedSchema) + assert(df.collect().head.getAs[Timestamp](1).getTime === 1322907330000L) assert(df.collect().head.getAs[Timestamp](3).getTime === 1322892930000L) } + test("Test custom timestampFormat with offset") { + val df = spark.read + .option("rowTag", "book") + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss Z") + .xml(resDir + "time.xml") + val expectedSchema = + buildSchema( + field("author"), + field("time", TimestampType), + field("time2", StringType), + field("time3", StringType), + field("time4", TimestampType) + ) + assert(df.schema === expectedSchema) + assert(df.collect().head.getAs[Timestamp](1).getTime === 1322907330000L) + assert(df.collect().head.getAs[Timestamp](4).getTime === 1322892930000L) + } + test("Test null number type is null not 0.0") { val schema = buildSchema( struct("Header", From 375f7ea6dcab6127b0e1c3c0a3ae55db71436c6e Mon Sep 17 00:00:00 2001 From: joristruong Date: Fri, 30 Dec 2022 16:11:03 +0800 Subject: [PATCH 07/16] fix: ISO_INSTANT --- src/main/scala/com/databricks/spark/xml/util/TypeCast.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala b/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala index 23134738..afb97d2a 100644 --- a/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala +++ b/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala @@ -112,7 +112,7 @@ private[xml] object TypeCast { // 2002-05-30T21:46:54+06:00 DateTimeFormatter.ISO_OFFSET_DATE_TIME, // 2002-05-30T21:46:54.1234Z - DateTimeFormatter.ISO_INSTANT + 
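// ISO_INSTANT alone resolves only to an instant; the UTC override lets ZonedDateTime.parse succeed on such values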
DateTimeFormatter.ISO_INSTANT.withZone(ZoneId.of("UTC")) ) private def parseXmlTimestamp(value: String, options: XmlOptions): Timestamp = { From 869558f0565b73aec1eaab5ec99cf04f655cec3b Mon Sep 17 00:00:00 2001 From: joristruong Date: Fri, 30 Dec 2022 16:14:57 +0800 Subject: [PATCH 08/16] feat: added isParseableAsZonedDateTime --- .../databricks/spark/xml/util/TypeCast.scala | 53 +++++++++--------- .../spark/xml/util/TypeCastSuite.scala | 55 +++++++++++++++++++ 2 files changed, 82 insertions(+), 26 deletions(-) diff --git a/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala b/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala index afb97d2a..e297882d 100644 --- a/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala +++ b/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala @@ -17,9 +17,10 @@ package com.databricks.spark.xml.util import java.math.BigDecimal import java.sql.{Date, Timestamp} -import java.text.NumberFormat +import java.text.{NumberFormat, ParsePosition} import java.time.{LocalDate, ZoneId, ZonedDateTime} import java.time.format.{DateTimeFormatter, DateTimeFormatterBuilder} +import java.time.temporal.TemporalQueries import java.util.Locale import scala.util.Try @@ -116,32 +117,19 @@ private[xml] object TypeCast { ) private def parseXmlTimestamp(value: String, options: XmlOptions): Timestamp = { - // Loop over built-in formats - supportedXmlTimestampFormatters.foreach { format => - try { - return Timestamp.from( - ZonedDateTime.parse(value, format).toInstant - ) - } catch { - case _: Exception => // continue - } - } - // Custom format - if (options.timestampFormat.isDefined) { - val format = DateTimeFormatter.ofPattern(options.timestampFormat.get) - try { - // Custom format with timezone or offset - return Timestamp.from( - ZonedDateTime.parse(value, format).toInstant - ) - } catch { - case _: Exception => // continue - } + val formatters = options.timestampFormat.map(DateTimeFormatter.ofPattern). 
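+      // the custom formatter, if any, is appended after the built-ins, so the built-in formats are always tried first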
+ map(supportedXmlTimestampFormatters :+ _).getOrElse(supportedXmlTimestampFormatters) + formatters.foreach { format => try { - // Custom format without timezone or offset - return Timestamp.from( - ZonedDateTime.parse(value, format.withZone(ZoneId.of(options.timezone.get))).toInstant - ) + if (isParseableAsZonedDateTime(value, format)) { + return Timestamp.from( + ZonedDateTime.parse(value, format).toInstant + ) + } else { + return Timestamp.from( + ZonedDateTime.parse(value, format.withZone(ZoneId.of(options.timezone.get))).toInstant + ) + } } catch { case _: Exception => // continue } @@ -289,4 +277,17 @@ private[xml] object TypeCast { TypeCast.castTo(data, FloatType, options).asInstanceOf[Float] } } + + private[xml] def isParseableAsZonedDateTime(value: String, + formatter: DateTimeFormatter): Boolean = { + val pos = new ParsePosition(0) + val temporalAccessor = formatter.parseUnresolved(value, pos) + // Checks if there is error in parsing + val parseable = pos.getErrorIndex < 0 && pos.getIndex >= value.length + // Checks if has zone, offset or timezone information + val hasTemporalInformation = (temporalAccessor != null && + temporalAccessor.query(TemporalQueries.zone()) != null) || + formatter.getZone != null + parseable && hasTemporalInformation + } } diff --git a/src/test/scala/com/databricks/spark/xml/util/TypeCastSuite.scala b/src/test/scala/com/databricks/spark/xml/util/TypeCastSuite.scala index fbc7885c..220ec7ec 100644 --- a/src/test/scala/com/databricks/spark/xml/util/TypeCastSuite.scala +++ b/src/test/scala/com/databricks/spark/xml/util/TypeCastSuite.scala @@ -18,12 +18,14 @@ package com.databricks.spark.xml.util import java.math.BigDecimal import java.sql.{Date, Timestamp} import java.time.{ZoneId, ZonedDateTime} +import java.time.format.{DateTimeFormatter, DateTimeFormatterBuilder} import java.util.Locale import org.scalatest.funsuite.AnyFunSuite import org.apache.spark.sql.types._ import com.databricks.spark.xml.XmlOptions +import com.databricks.spark.xml.util.TypeCast.isParseableAsZonedDateTime final class TypeCastSuite extends AnyFunSuite { @@ -162,4 +164,57 @@ final class TypeCastSuite extends AnyFunSuite { Locale.setDefault(defaultLocale) } } + + test("Test if string is parseable as a timestamp") { + val supportedXmlTimestampFormatters = Seq( + // 2002-05-30 21:46:54 + new DateTimeFormatterBuilder() + .parseCaseInsensitive() + .append(DateTimeFormatter.ISO_LOCAL_DATE) + .appendLiteral(' ') + .append(DateTimeFormatter.ISO_LOCAL_TIME) + .toFormatter() + .withZone(ZoneId.of("UTC")), + // 2002-05-30T21:46:54 + DateTimeFormatter.ISO_LOCAL_DATE_TIME.withZone(ZoneId.of("UTC")), + // 2002-05-30T21:46:54+06:00 + DateTimeFormatter.ISO_OFFSET_DATE_TIME, + // 2002-05-30T21:46:54.1234Z + DateTimeFormatter.ISO_INSTANT.withZone(ZoneId.of("UTC")) + ) + + val supportedXmlTimestamps = Seq( + "2002-05-30 21:46:54", + "2002-05-30T21:46:54", + "2002-05-30T21:46:54+06:00", + "2002-05-30T21:46:54.1234Z" + ) + + val checkBuiltInTimestamps = supportedXmlTimestampFormatters.zip(supportedXmlTimestamps) + + checkBuiltInTimestamps.foreach { case(format, value) => + assert(isParseableAsZonedDateTime(value, format)) + } + + assert(isParseableAsZonedDateTime( + "12-03-2011 10:15:30 PST", + DateTimeFormatter.ofPattern("MM-dd-yyyy HH:mm:ss z") + )) + assert(isParseableAsZonedDateTime( + "2011/12/03 16:15:30 +1000", + DateTimeFormatter.ofPattern("yyyy/MM/dd HH:mm:ss Z") + )) + assert(!isParseableAsZonedDateTime( + "2011/12/03 16:15:30", + DateTimeFormatter.ofPattern("yyyy/MM/dd HH:mm:ss") + )) + 
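// "PS" is not a recognisable zone name: the parse itself fails, so the value is rejected regardless of any zone query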
assert(!isParseableAsZonedDateTime( + "12-03-2011 10:15:30 PS", + DateTimeFormatter.ofPattern("MM-dd-yyyy HH:mm:ss z") + )) + assert(!isParseableAsZonedDateTime( + "12-03-2011 10:15:30 PST", + DateTimeFormatter.ofPattern("MM-dd-yyyy HH:mm:ss") + )) + } } From 8b53bbba03375a275e6836ea32c834b2b657e9df Mon Sep 17 00:00:00 2001 From: joristruong Date: Fri, 30 Dec 2022 22:57:13 +0800 Subject: [PATCH 09/16] refactor: isParseableAsZonedDateTime and Spark timeZone --- .../databricks/spark/xml/DefaultSource.scala | 19 ++++++------------- .../databricks/spark/xml/util/TypeCast.scala | 11 ++++------- .../spark/xml/util/TypeCastSuite.scala | 6 ++---- 3 files changed, 12 insertions(+), 24 deletions(-) diff --git a/src/main/scala/com/databricks/spark/xml/DefaultSource.scala b/src/main/scala/com/databricks/spark/xml/DefaultSource.scala index 04772d0e..60ecafd8 100755 --- a/src/main/scala/com/databricks/spark/xml/DefaultSource.scala +++ b/src/main/scala/com/databricks/spark/xml/DefaultSource.scala @@ -71,19 +71,12 @@ class DefaultSource "spark.sql.session.timeZone" ) - if (sparkTimezone.isDefined) { - XmlRelation( - () => XmlFile.withCharset(sqlContext.sparkContext, path, charset, rowTag), - Some(path), - parameters + ("timezone" -> sparkTimezone.get), - schema)(sqlContext) - } else { - XmlRelation( - () => XmlFile.withCharset(sqlContext.sparkContext, path, charset, rowTag), - Some(path), - parameters, - schema)(sqlContext) - } + XmlRelation( + () => XmlFile.withCharset(sqlContext.sparkContext, path, charset, rowTag), + Some(path), + if (sparkTimezone.isDefined) parameters + ("timezone" -> sparkTimezone.get) + else parameters, + schema)(sqlContext) } override def createRelation( diff --git a/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala b/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala index e297882d..e108f555 100644 --- a/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala +++ b/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala @@ -121,15 +121,12 @@ private[xml] object TypeCast { map(supportedXmlTimestampFormatters :+ _).getOrElse(supportedXmlTimestampFormatters) formatters.foreach { format => try { - if (isParseableAsZonedDateTime(value, format)) { - return Timestamp.from( - ZonedDateTime.parse(value, format).toInstant - ) + val extendedFormat = if (isParseableAsZonedDateTime(value, format)) { + format } else { - return Timestamp.from( - ZonedDateTime.parse(value, format.withZone(ZoneId.of(options.timezone.get))).toInstant - ) + format.withZone(ZoneId.of(options.timezone.get)) } + return Timestamp.from(ZonedDateTime.parse(value, extendedFormat).toInstant) } catch { case _: Exception => // continue } diff --git a/src/test/scala/com/databricks/spark/xml/util/TypeCastSuite.scala b/src/test/scala/com/databricks/spark/xml/util/TypeCastSuite.scala index 220ec7ec..8b739cf8 100644 --- a/src/test/scala/com/databricks/spark/xml/util/TypeCastSuite.scala +++ b/src/test/scala/com/databricks/spark/xml/util/TypeCastSuite.scala @@ -179,15 +179,12 @@ final class TypeCastSuite extends AnyFunSuite { DateTimeFormatter.ISO_LOCAL_DATE_TIME.withZone(ZoneId.of("UTC")), // 2002-05-30T21:46:54+06:00 DateTimeFormatter.ISO_OFFSET_DATE_TIME, - // 2002-05-30T21:46:54.1234Z - DateTimeFormatter.ISO_INSTANT.withZone(ZoneId.of("UTC")) ) val supportedXmlTimestamps = Seq( "2002-05-30 21:46:54", "2002-05-30T21:46:54", - "2002-05-30T21:46:54+06:00", - "2002-05-30T21:46:54.1234Z" + "2002-05-30T21:46:54+06:00" ) val checkBuiltInTimestamps = supportedXmlTimestampFormatters.zip(supportedXmlTimestamps) 
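The helper these tests exercise combines two `java.time` facilities: `DateTimeFormatter.parseUnresolved`, which attempts a parse without resolving fields and reports failure through a `ParsePosition` instead of throwing, and `TemporalQueries.zone()`, which extracts whatever zone information the parse captured. A self-contained restatement of the check (same logic as `isParseableAsZonedDateTime`, renamed here for illustration):

```scala
import java.text.ParsePosition
import java.time.format.DateTimeFormatter
import java.time.temporal.TemporalQueries

// True when `value` parses completely under `formatter` and either the parsed
// text or the formatter's own zone override carries zone/offset information.
def parseableAsZoned(value: String, formatter: DateTimeFormatter): Boolean = {
  val pos = new ParsePosition(0)
  val parsed = formatter.parseUnresolved(value, pos) // null on hard failure
  val fullyParsed = pos.getErrorIndex < 0 && pos.getIndex >= value.length
  val hasZone = (parsed != null && parsed.query(TemporalQueries.zone()) != null) ||
    formatter.getZone != null
  fullyParsed && hasZone
}

parseableAsZoned("12-03-2011 10:15:30 PST",
  DateTimeFormatter.ofPattern("MM-dd-yyyy HH:mm:ss z")) // true: named zone parsed
parseableAsZoned("2011/12/03 16:15:30",
  DateTimeFormatter.ofPattern("yyyy/MM/dd HH:mm:ss"))   // false: no zone information
```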
@@ -195,6 +192,7 @@ final class TypeCastSuite extends AnyFunSuite { checkBuiltInTimestamps.foreach { case(format, value) => assert(isParseableAsZonedDateTime(value, format)) } + sys.exit() assert(isParseableAsZonedDateTime( "12-03-2011 10:15:30 PST", From ab97af25711c5e4892e60c26067225a54fa401ae Mon Sep 17 00:00:00 2001 From: joristruong Date: Fri, 30 Dec 2022 23:37:12 +0800 Subject: [PATCH 10/16] fix: removed sys.exit --- src/test/scala/com/databricks/spark/xml/util/TypeCastSuite.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/src/test/scala/com/databricks/spark/xml/util/TypeCastSuite.scala b/src/test/scala/com/databricks/spark/xml/util/TypeCastSuite.scala index 8b739cf8..50ddc66f 100644 --- a/src/test/scala/com/databricks/spark/xml/util/TypeCastSuite.scala +++ b/src/test/scala/com/databricks/spark/xml/util/TypeCastSuite.scala @@ -192,7 +192,6 @@ final class TypeCastSuite extends AnyFunSuite { checkBuiltInTimestamps.foreach { case(format, value) => assert(isParseableAsZonedDateTime(value, format)) } - sys.exit() assert(isParseableAsZonedDateTime( "12-03-2011 10:15:30 PST", From 124a90acfe674c78d441aacb8ed8d9f94d207ebb Mon Sep 17 00:00:00 2001 From: joristruong Date: Sat, 31 Dec 2022 11:27:08 +0800 Subject: [PATCH 11/16] feat: use Instant.from() instead of converting a ZonedDateTime to an Instant --- .../databricks/spark/xml/DefaultSource.scala | 9 +- .../databricks/spark/xml/util/TypeCast.scala | 40 +++---- .../spark/xml/util/TypeCastSuite.scala | 105 ++++++++++-------- 3 files changed, 79 insertions(+), 75 deletions(-) diff --git a/src/main/scala/com/databricks/spark/xml/DefaultSource.scala b/src/main/scala/com/databricks/spark/xml/DefaultSource.scala index 60ecafd8..a74345b9 100755 --- a/src/main/scala/com/databricks/spark/xml/DefaultSource.scala +++ b/src/main/scala/com/databricks/spark/xml/DefaultSource.scala @@ -67,15 +67,14 @@ class DefaultSource (options.charset, options.rowTag) } - val sparkTimezone = sqlContext.sparkContext.getConf.getOption( - "spark.sql.session.timeZone" - ) + val paramsWithTZ = parameters ++ + sqlContext.sparkContext.getConf.getOption("spark.sql.session.timeZone"). 
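+      // getOption returns an Option, so ++ adds a "timezone" entry only when spark.sql.session.timeZone is actually configured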
+ map { tz => "timezone" -> tz } XmlRelation( () => XmlFile.withCharset(sqlContext.sparkContext, path, charset, rowTag), Some(path), - if (sparkTimezone.isDefined) parameters + ("timezone" -> sparkTimezone.get) - else parameters, + paramsWithTZ, schema)(sqlContext) } diff --git a/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala b/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala index e108f555..41503b7b 100644 --- a/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala +++ b/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala @@ -17,10 +17,9 @@ package com.databricks.spark.xml.util import java.math.BigDecimal import java.sql.{Date, Timestamp} -import java.text.{NumberFormat, ParsePosition} -import java.time.{LocalDate, ZoneId, ZonedDateTime} +import java.text.NumberFormat +import java.time.{Instant, LocalDate, ZoneId} import java.time.format.{DateTimeFormatter, DateTimeFormatterBuilder} -import java.time.temporal.TemporalQueries import java.util.Locale import scala.util.Try @@ -113,20 +112,22 @@ private[xml] object TypeCast { // 2002-05-30T21:46:54+06:00 DateTimeFormatter.ISO_OFFSET_DATE_TIME, // 2002-05-30T21:46:54.1234Z - DateTimeFormatter.ISO_INSTANT.withZone(ZoneId.of("UTC")) + DateTimeFormatter.ISO_INSTANT ) private def parseXmlTimestamp(value: String, options: XmlOptions): Timestamp = { - val formatters = options.timestampFormat.map(DateTimeFormatter.ofPattern). - map(supportedXmlTimestampFormatters :+ _).getOrElse(supportedXmlTimestampFormatters) - formatters.foreach { format => + supportedXmlTimestampFormatters.foreach { format => + try { + return Timestamp.from(Instant.from(format.parse(value))) + } catch { + case _: Exception => // continue + } + } + options.timestampFormat.foreach { formatString => + val format = DateTimeFormatter.ofPattern(formatString). 
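+        // withZone(null) applies no override, so without a session timezone a zone-less value under a zone-less pattern still fails to resolve to an Instant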
+ withZone(options.timezone.map(ZoneId.of).orNull) try { - val extendedFormat = if (isParseableAsZonedDateTime(value, format)) { - format - } else { - format.withZone(ZoneId.of(options.timezone.get)) - } - return Timestamp.from(ZonedDateTime.parse(value, extendedFormat).toInstant) + return Timestamp.from(Instant.from(format.parse(value))) } catch { case _: Exception => // continue } @@ -274,17 +275,4 @@ private[xml] object TypeCast { TypeCast.castTo(data, FloatType, options).asInstanceOf[Float] } } - - private[xml] def isParseableAsZonedDateTime(value: String, - formatter: DateTimeFormatter): Boolean = { - val pos = new ParsePosition(0) - val temporalAccessor = formatter.parseUnresolved(value, pos) - // Checks if there is error in parsing - val parseable = pos.getErrorIndex < 0 && pos.getIndex >= value.length - // Checks if has zone, offset or timezone information - val hasTemporalInformation = (temporalAccessor != null && - temporalAccessor.query(TemporalQueries.zone()) != null) || - formatter.getZone != null - parseable && hasTemporalInformation - } } diff --git a/src/test/scala/com/databricks/spark/xml/util/TypeCastSuite.scala b/src/test/scala/com/databricks/spark/xml/util/TypeCastSuite.scala index 50ddc66f..2fce0794 100644 --- a/src/test/scala/com/databricks/spark/xml/util/TypeCastSuite.scala +++ b/src/test/scala/com/databricks/spark/xml/util/TypeCastSuite.scala @@ -18,14 +18,12 @@ package com.databricks.spark.xml.util import java.math.BigDecimal import java.sql.{Date, Timestamp} import java.time.{ZoneId, ZonedDateTime} -import java.time.format.{DateTimeFormatter, DateTimeFormatterBuilder} import java.util.Locale import org.scalatest.funsuite.AnyFunSuite import org.apache.spark.sql.types._ import com.databricks.spark.xml.XmlOptions -import com.databricks.spark.xml.util.TypeCast.isParseableAsZonedDateTime final class TypeCastSuite extends AnyFunSuite { @@ -165,53 +163,72 @@ final class TypeCastSuite extends AnyFunSuite { } } - test("Test if string is parseable as a timestamp") { - val supportedXmlTimestampFormatters = Seq( - // 2002-05-30 21:46:54 - new DateTimeFormatterBuilder() - .parseCaseInsensitive() - .append(DateTimeFormatter.ISO_LOCAL_DATE) - .appendLiteral(' ') - .append(DateTimeFormatter.ISO_LOCAL_TIME) - .toFormatter() - .withZone(ZoneId.of("UTC")), - // 2002-05-30T21:46:54 - DateTimeFormatter.ISO_LOCAL_DATE_TIME.withZone(ZoneId.of("UTC")), - // 2002-05-30T21:46:54+06:00 - DateTimeFormatter.ISO_OFFSET_DATE_TIME, + test("Parsing built-in timestamp formatters") { + val options = XmlOptions(Map()) + val expectedResult = Timestamp.from( + ZonedDateTime.of(2002, 5, 30, 21, 46, 54, 0, ZoneId.of("UTC")) + .toInstant ) + assert( + TypeCast.castTo("2002-05-30 21:46:54", TimestampType, options) === expectedResult + ) + assert( + TypeCast.castTo("2002-05-30T21:46:54", TimestampType, options) === expectedResult + ) + assert( + TypeCast.castTo("2002-05-30T21:46:54+00:00", TimestampType, options) === expectedResult + ) + assert( + TypeCast.castTo("2002-05-30T21:46:54.0000Z", TimestampType, options) === expectedResult + ) + } - val supportedXmlTimestamps = Seq( - "2002-05-30 21:46:54", - "2002-05-30T21:46:54", - "2002-05-30T21:46:54+06:00" + test("Custom timestamp format is used to parse correctly") { + var options = XmlOptions(Map("timestampFormat" -> "MM-dd-yyyy HH:mm:ss", "timezone" -> "UTC")) + assert( + TypeCast.castTo("12-03-2011 10:15:30", TimestampType, options) === + Timestamp.from( + ZonedDateTime.of(2011, 12, 3, 10, 15, 30, 0, ZoneId.of("UTC")) + .toInstant + ) ) - val 
checkBuiltInTimestamps = supportedXmlTimestampFormatters.zip(supportedXmlTimestamps) + options = XmlOptions(Map("timestampFormat" -> "yyyy/MM/dd HH:mm:ss", "timezone" -> "UTC")) + assert( + TypeCast.castTo("2011/12/03 10:15:30", TimestampType, options) === + Timestamp.from( + ZonedDateTime.of(2011, 12, 3, 10, 15, 30, 0, ZoneId.of("UTC")) + .toInstant + ) + ) - checkBuiltInTimestamps.foreach { case(format, value) => - assert(isParseableAsZonedDateTime(value, format)) - } + options = XmlOptions(Map("timestampFormat" -> "yyyy/MM/dd HH:mm:ss", + "timezone" -> "Asia/Shanghai")) + assert( + TypeCast.castTo("2011/12/03 10:15:30", TimestampType, options) !== + Timestamp.from( + ZonedDateTime.of(2011, 12, 3, 10, 15, 30, 0, ZoneId.of("UTC")) + .toInstant + ) + ) - assert(isParseableAsZonedDateTime( - "12-03-2011 10:15:30 PST", - DateTimeFormatter.ofPattern("MM-dd-yyyy HH:mm:ss z") - )) - assert(isParseableAsZonedDateTime( - "2011/12/03 16:15:30 +1000", - DateTimeFormatter.ofPattern("yyyy/MM/dd HH:mm:ss Z") - )) - assert(!isParseableAsZonedDateTime( - "2011/12/03 16:15:30", - DateTimeFormatter.ofPattern("yyyy/MM/dd HH:mm:ss") - )) - assert(!isParseableAsZonedDateTime( - "12-03-2011 10:15:30 PS", - DateTimeFormatter.ofPattern("MM-dd-yyyy HH:mm:ss z") - )) - assert(!isParseableAsZonedDateTime( - "12-03-2011 10:15:30 PST", - DateTimeFormatter.ofPattern("MM-dd-yyyy HH:mm:ss") - )) + options = XmlOptions(Map("timestampFormat" -> "yyyy/MM/dd HH:mm:ss", + "timezone" -> "Asia/Shanghai")) + assert( + TypeCast.castTo("2011/12/03 10:15:30", TimestampType, options) === + Timestamp.from( + ZonedDateTime.of(2011, 12, 3, 10, 15, 30, 0, ZoneId.of("Asia/Shanghai")) + .toInstant + ) + ) + + options = XmlOptions(Map("timestampFormat" -> "yyyy/MM/dd HH:mm:ss")) + intercept[IllegalArgumentException]( + TypeCast.castTo("2011/12/03 10:15:30", TimestampType, options) === + Timestamp.from( + ZonedDateTime.of(2011, 12, 3, 10, 15, 30, 0, ZoneId.of("UTC")) + .toInstant + ) + ) } } From 5728d543e32f56132b733c80343cf2aefcfad630 Mon Sep 17 00:00:00 2001 From: joristruong Date: Sun, 1 Jan 2023 00:44:35 +0800 Subject: [PATCH 12/16] fix: parameters with timezone --- .../scala/com/databricks/spark/xml/DefaultSource.scala | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/main/scala/com/databricks/spark/xml/DefaultSource.scala b/src/main/scala/com/databricks/spark/xml/DefaultSource.scala index a74345b9..934878ec 100755 --- a/src/main/scala/com/databricks/spark/xml/DefaultSource.scala +++ b/src/main/scala/com/databricks/spark/xml/DefaultSource.scala @@ -67,9 +67,11 @@ class DefaultSource (options.charset, options.rowTag) } - val paramsWithTZ = parameters ++ - sqlContext.sparkContext.getConf.getOption("spark.sql.session.timeZone"). 
- map { tz => "timezone" -> tz } + val paramsWithTZ = + sqlContext.sparkContext.getConf.getOption("spark.sql.session.timeZone") match { + case Some(tz) => parameters.updated("timezone", tz) + case None => parameters + } XmlRelation( () => XmlFile.withCharset(sqlContext.sparkContext, path, charset, rowTag), From b8d3e4b50d1b637f558f936a894cf7af7d718205 Mon Sep 17 00:00:00 2001 From: joristruong Date: Sun, 1 Jan 2023 18:04:01 +0800 Subject: [PATCH 13/16] fix: apply Spark timeZone only if no temporal information --- .../databricks/spark/xml/util/TypeCast.scala | 20 ++++++++++++++----- .../com/databricks/spark/xml/XmlSuite.scala | 4 ++-- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala b/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala index 41503b7b..71314705 100644 --- a/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala +++ b/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala @@ -17,17 +17,17 @@ package com.databricks.spark.xml.util import java.math.BigDecimal import java.sql.{Date, Timestamp} -import java.text.NumberFormat +import java.text.{NumberFormat, ParsePosition} import java.time.{Instant, LocalDate, ZoneId} import java.time.format.{DateTimeFormatter, DateTimeFormatterBuilder} import java.util.Locale - import scala.util.Try import scala.util.control.Exception._ - import org.apache.spark.sql.types._ import com.databricks.spark.xml.XmlOptions +import java.time.temporal.TemporalQueries + /** * Utility functions for type casting */ @@ -124,8 +124,18 @@ private[xml] object TypeCast { } } options.timestampFormat.foreach { formatString => - val format = DateTimeFormatter.ofPattern(formatString). - withZone(options.timezone.map(ZoneId.of).orNull) + // Check if there is offset or timezone and apply Spark timeZone if not + val hasTemporalInformation = formatString.indexOf("V") + + formatString.indexOf("z") + + formatString.indexOf("O") + + formatString.indexOf("X") + + formatString.indexOf("x") + + formatString.indexOf("Z") == (-6) + val format = if (hasTemporalInformation) { + DateTimeFormatter.ofPattern(formatString) + } else { + DateTimeFormatter.ofPattern(formatString).withZone(options.timezone.map(ZoneId.of).orNull) + } try { return Timestamp.from(Instant.from(format.parse(value))) } catch { diff --git a/src/test/scala/com/databricks/spark/xml/XmlSuite.scala b/src/test/scala/com/databricks/spark/xml/XmlSuite.scala index 571cbc76..8f4c16b9 100755 --- a/src/test/scala/com/databricks/spark/xml/XmlSuite.scala +++ b/src/test/scala/com/databricks/spark/xml/XmlSuite.scala @@ -47,7 +47,7 @@ final class XmlSuite extends AnyFunSuite with BeforeAndAfterAll { master("local[2]"). appName("XmlSuite"). config("spark.ui.enabled", false). - config("spark.sql.session.timeZone", "UTC"). + //config("spark.sql.session.timeZone", "UTC"). 
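+    // with the session timezone unset here, no "timezone" option reaches the reader, so zone-less custom patterns cannot resolve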
getOrCreate() } private var tempDir: Path = _ @@ -1432,7 +1432,7 @@ final class XmlSuite extends AnyFunSuite with BeforeAndAfterAll { test("Test custom timestampFormat with offset") { val df = spark.read .option("rowTag", "book") - .option("timestampFormat", "yyyy/MM/dd HH:mm:ss Z") + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss xx") .xml(resDir + "time.xml") val expectedSchema = buildSchema( From 3789a676e2245d1bfb94f11b78b2b6aa4e267cd2 Mon Sep 17 00:00:00 2001 From: joristruong Date: Sun, 1 Jan 2023 18:12:16 +0800 Subject: [PATCH 14/16] fix: hasTemporalInformation --- src/main/scala/com/databricks/spark/xml/util/TypeCast.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala b/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala index 71314705..8438abd3 100644 --- a/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala +++ b/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala @@ -130,7 +130,7 @@ private[xml] object TypeCast { formatString.indexOf("O") + formatString.indexOf("X") + formatString.indexOf("x") + - formatString.indexOf("Z") == (-6) + formatString.indexOf("Z") != (-6) val format = if (hasTemporalInformation) { DateTimeFormatter.ofPattern(formatString) } else { From a88a20fd181915d27b7f4e559a7e0ccb14409c51 Mon Sep 17 00:00:00 2001 From: joristruong Date: Sun, 1 Jan 2023 18:49:59 +0800 Subject: [PATCH 15/16] fix: spark config --- src/test/scala/com/databricks/spark/xml/XmlSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/scala/com/databricks/spark/xml/XmlSuite.scala b/src/test/scala/com/databricks/spark/xml/XmlSuite.scala index 8f4c16b9..2acfb4c0 100755 --- a/src/test/scala/com/databricks/spark/xml/XmlSuite.scala +++ b/src/test/scala/com/databricks/spark/xml/XmlSuite.scala @@ -47,7 +47,7 @@ final class XmlSuite extends AnyFunSuite with BeforeAndAfterAll { master("local[2]"). appName("XmlSuite"). config("spark.ui.enabled", false). - //config("spark.sql.session.timeZone", "UTC"). + config("spark.sql.session.timeZone", "UTC"). getOrCreate() } private var tempDir: Path = _ From 89486f165008f05db83ceac8600cbd34149d3101 Mon Sep 17 00:00:00 2001 From: JorisTruong Date: Tue, 3 Jan 2023 00:24:55 +0800 Subject: [PATCH 16/16] docs: commented for java 8 and 11 --- src/main/scala/com/databricks/spark/xml/util/TypeCast.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala b/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala index 8438abd3..2e81affc 100644 --- a/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala +++ b/src/main/scala/com/databricks/spark/xml/util/TypeCast.scala @@ -125,6 +125,7 @@ private[xml] object TypeCast { } options.timestampFormat.foreach { formatString => // Check if there is offset or timezone and apply Spark timeZone if not + // Useful to support Java 8 and Java 11+ as they prioritize zone and offset differently val hasTemporalInformation = formatString.indexOf("V") + formatString.indexOf("z") + formatString.indexOf("O") +