[SPARK-21263][SQL] Do not allow partially parsing double and floats via NumberFormat in CSV

## What changes were proposed in this pull request?

This PR proposes to remove the use of `NumberFormat.parse` so that partially parsed data is no longer silently accepted. For example, before this change the malformed value `10u12` was read as `10.0`:

```
scala> spark.read.schema("a DOUBLE").option("mode", "FAILFAST").csv(Seq("10u12").toDS).show()
+----+
|   a|
+----+
|10.0|
+----+
```
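
For reference, here is a minimal standalone sketch (plain Scala against the JDK, outside Spark; the `PartialParseDemo` object is only for illustration) of why the old fallback accepted such input: `java.text.NumberFormat.parse` stops at the first character it cannot interpret and returns whatever numeric prefix it did parse, whereas `String.toDouble` rejects the whole token.

```scala
import java.text.NumberFormat
import java.util.Locale

import scala.util.Try

object PartialParseDemo extends App {
  val nf = NumberFormat.getInstance(Locale.US)

  // NumberFormat.parse consumes the leading "10" and silently ignores "u12".
  println(nf.parse("10u12").doubleValue())   // 10.0

  // toDouble refuses to truncate and fails on the whole token instead.
  println(Try("10u12".toDouble))             // Failure(java.lang.NumberFormatException: ...)
}
```

With the fallback removed, the `NumberFormatException` from `toFloat`/`toDouble` surfaces to the CSV parser's malformed-record handling instead of producing a truncated value.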

## How was this patch tested?

Unit tests added in `UnivocityParserSuite` and `CSVSuite`.
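
For illustration, the behavior exercised by the new `CSVSuite` test can be summarized as follows in spark-shell, where `spark` and the implicits are in scope (a sketch of the expected post-fix behavior per parse mode, not verbatim output):

```scala
// FAILFAST: the malformed value now fails the read instead of being parsed as 10.0.
spark.read.schema("a DOUBLE").option("mode", "FAILFAST")
  .csv(Seq("10u12").toDS()).collect()
// => org.apache.spark.SparkException ... For input string: "10u12"

// DROPMALFORMED: the malformed row is dropped.
spark.read.schema("a FLOAT").option("mode", "DROPMALFORMED")
  .csv(Seq("10u12").toDS()).count()   // 0

// PERMISSIVE: the malformed field becomes null.
spark.read.schema("a FLOAT").option("mode", "PERMISSIVE")
  .csv(Seq("10u12").toDS()).show()
// +----+
// |   a|
// +----+
// |null|
// +----+
```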

Author: hyukjinkwon <gurwls223@gmail.com>

Closes #18532 from HyukjinKwon/SPARK-21263.
HyukjinKwon authored and srowen committed Jul 11, 2017
1 parent a4baa8f commit 7514db1
Showing 3 changed files with 34 additions and 16 deletions.

@@ -111,19 +111,15 @@ class UnivocityParser(
         case options.nanValue => Float.NaN
         case options.negativeInf => Float.NegativeInfinity
         case options.positiveInf => Float.PositiveInfinity
-        case datum =>
-          Try(datum.toFloat)
-            .getOrElse(NumberFormat.getInstance(Locale.US).parse(datum).floatValue())
+        case datum => datum.toFloat
       }
 
     case _: DoubleType => (d: String) =>
       nullSafeDatum(d, name, nullable, options) {
         case options.nanValue => Double.NaN
         case options.negativeInf => Double.NegativeInfinity
         case options.positiveInf => Double.PositiveInfinity
-        case datum =>
-          Try(datum.toDouble)
-            .getOrElse(NumberFormat.getInstance(Locale.US).parse(datum).doubleValue())
+        case datum => datum.toDouble
       }
 
     case _: BooleanType => (d: String) =>

@@ -1174,4 +1174,25 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
       }
     }
   }
+
+  test("SPARK-21263: Invalid float and double are handled correctly in different modes") {
+    val exception = intercept[SparkException] {
+      spark.read.schema("a DOUBLE")
+        .option("mode", "FAILFAST")
+        .csv(Seq("10u12").toDS())
+        .collect()
+    }
+    assert(exception.getMessage.contains("""input string: "10u12""""))
+
+    val count = spark.read.schema("a FLOAT")
+      .option("mode", "DROPMALFORMED")
+      .csv(Seq("10u12").toDS())
+      .count()
+    assert(count == 0)
+
+    val results = spark.read.schema("a FLOAT")
+      .option("mode", "PERMISSIVE")
+      .csv(Seq("10u12").toDS())
+    checkAnswer(results, Row(null))
+  }
 }

@@ -130,16 +130,17 @@ class UnivocityParserSuite extends SparkFunSuite {
       DateTimeUtils.millisToDays(DateTimeUtils.stringToTime("2015-01-01").getTime))
   }
 
-  test("Float and Double Types are cast without respect to platform default Locale") {
-    val originalLocale = Locale.getDefault
-    try {
-      Locale.setDefault(new Locale("fr", "FR"))
-      // Would parse as 1.0 in fr-FR
-      val options = new CSVOptions(Map.empty[String, String], "GMT")
-      assert(parser.makeConverter("_1", FloatType, options = options).apply("1,00") == 100.0)
-      assert(parser.makeConverter("_1", DoubleType, options = options).apply("1,00") == 100.0)
-    } finally {
-      Locale.setDefault(originalLocale)
+  test("Throws exception for casting an invalid string to Float and Double Types") {
+    val options = new CSVOptions(Map.empty[String, String], "GMT")
+    val types = Seq(DoubleType, FloatType)
+    val input = Seq("10u000", "abc", "1 2/3")
+    types.foreach { dt =>
+      input.foreach { v =>
+        val message = intercept[NumberFormatException] {
+          parser.makeConverter("_1", dt, options = options).apply(v)
+        }.getMessage
+        assert(message.contains(v))
+      }
     }
   }
 