Skip to content

Commit

Permalink
[SPARK-40755][SQL] Migrate type check failures of number formatting o…
Browse files Browse the repository at this point in the history
…nto error classes

### What changes were proposed in this pull request?
This pr aims to replaces TypeCheckFailure by DataTypeMismatch in type checks in the number formatting or parsing expressions, includes:
1. ToNumber (1):
https://github.com/apache/spark/blob/1431975723d8df30a25b2333eddcfd0bb6c57677/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala#L83
2. ToCharacter (1):
https://github.com/apache/spark/blob/1431975723d8df30a25b2333eddcfd0bb6c57677/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/numberFormatExpressions.scala#L227
3. ToNumberParser (1):
https://github.com/apache/spark/blob/5556cfc59aa97a3ad4ea0baacebe19859ec0bcb7/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ToNumberParser.scala#L262

### Why are the changes needed?
Migration onto error classes unifies Spark SQL error messages.

### Does this PR introduce _any_ user-facing change?
Yes. The PR changes user-facing error messages.

### How was this patch tested?
1. Add new UT
2. Update existed UT
3. Pass GA

Closes #38531 from panbingkun/SPARK-40755.

Authored-by: panbingkun <pbk1982@gmail.com>
Signed-off-by: Max Gekk <max.gekk@gmail.com>
  • Loading branch information
panbingkun authored and MaxGekk committed Nov 15, 2022
1 parent f3400e4 commit b8b90ad
Show file tree
Hide file tree
Showing 5 changed files with 275 additions and 87 deletions.
40 changes: 40 additions & 0 deletions core/src/main/resources/error/error-classes.json
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,46 @@
"Input to <functionName> should all be the same type, but it's <dataType>."
]
},
"FORMAT_CONT_THOUSANDS_SEPS" : {
"message" : [
"Thousands separators (, or G) must have digits in between them in the number format: <format>."
]
},
"FORMAT_CUR_MUST_BEFORE_DEC" : {
"message" : [
"Currency characters must appear before any decimal point in the number format: <format>."
]
},
"FORMAT_CUR_MUST_BEFORE_DIGIT" : {
"message" : [
"Currency characters must appear before digits in the number format: <format>."
]
},
"FORMAT_EMPTY" : {
"message" : [
"The number format string cannot be empty."
]
},
"FORMAT_THOUSANDS_SEPS_MUST_BEFORE_DEC" : {
"message" : [
"Thousands separators (, or G) may not appear after the decimal point in the number format: <format>."
]
},
"FORMAT_UNEXPECTED_TOKEN" : {
"message" : [
"Unexpected <token> found in the format string <format>; the structure of the format string must match: [MI|S] [$] [0|9|G|,]* [.|D] [0|9]* [$] [PR|MI|S]."
]
},
"FORMAT_WRONG_NUM_DIGIT" : {
"message" : [
"The format string requires at least one number digit."
]
},
"FORMAT_WRONG_NUM_TOKEN" : {
"message" : [
"At most one <token> is allowed in the number format: <format>."
]
},
"HASH_MAP_TYPE" : {
"message" : [
"Input to the function <functionName> cannot contain elements of the \"MAP\" type. In Spark, same maps may have different hashcode, thus hash expressions are prohibited on \"MAP\" elements. To restore previous behavior set \"spark.sql.legacy.allowHashOnMapType\" to \"true\"."
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ package org.apache.spark.sql.catalyst.expressions
import java.util.Locale

import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch
import org.apache.spark.sql.catalyst.expressions.Cast._
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode}
import org.apache.spark.sql.catalyst.expressions.codegen.Block.BlockHelper
import org.apache.spark.sql.catalyst.util.ToNumberParser
Expand Down Expand Up @@ -78,9 +80,16 @@ case class ToNumber(left: Expression, right: Expression)
val inputTypeCheck = super.checkInputDataTypes()
if (inputTypeCheck.isSuccess) {
if (right.foldable) {
numberFormatter.check()
numberFormatter.checkInputDataTypes()
} else {
TypeCheckResult.TypeCheckFailure(s"Format expression must be foldable, but got $right")
DataTypeMismatch(
errorSubClass = "NON_FOLDABLE_INPUT",
messageParameters = Map(
"inputName" -> toSQLId(right.prettyName),
"inputType" -> toSQLType(right.dataType),
"inputExpr" -> toSQLExpr(right)
)
)
}
} else {
inputTypeCheck
Expand Down Expand Up @@ -222,9 +231,16 @@ case class ToCharacter(left: Expression, right: Expression)
val inputTypeCheck = super.checkInputDataTypes()
if (inputTypeCheck.isSuccess) {
if (right.foldable) {
numberFormatter.check()
numberFormatter.checkInputDataTypes()
} else {
TypeCheckResult.TypeCheckFailure(s"Format expression must be foldable, but got $right")
DataTypeMismatch(
errorSubClass = "NON_FOLDABLE_INPUT",
messageParameters = Map(
"inputName" -> toSQLId(right.prettyName),
"inputType" -> toSQLType(right.dataType),
"inputExpr" -> toSQLExpr(right)
)
)
}
} else {
inputTypeCheck
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@ package org.apache.spark.sql.catalyst.util
import scala.collection.mutable

import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{DataTypeMismatch, TypeCheckSuccess}
import org.apache.spark.sql.catalyst.expressions.Cast._
import org.apache.spark.sql.errors.QueryExecutionErrors
import org.apache.spark.sql.types.{Decimal, DecimalType}
import org.apache.spark.sql.types.{Decimal, DecimalType, StringType}
import org.apache.spark.unsafe.types.UTF8String

// This object contains some definitions of characters and tokens for the parser below.
Expand Down Expand Up @@ -253,22 +255,7 @@ class ToNumberParser(numberFormat: String, errorOnFail: Boolean) extends Seriali
*/
def parsedDecimalType: DecimalType = DecimalType(precision, scale)

/**
* Consumes the format string to check validity and computes an appropriate Decimal output type.
*/
def check(): TypeCheckResult = {
val validateResult: String = validateFormatString
if (validateResult.nonEmpty) {
TypeCheckResult.TypeCheckFailure(validateResult)
} else {
TypeCheckResult.TypeCheckSuccess
}
}

/**
* This implementation of the [[check]] method returns any error, or the empty string on success.
*/
private def validateFormatString: String = {
def checkInputDataTypes(): TypeCheckResult = {
val firstDollarSignIndex: Int = formatTokens.indexOf(DollarSign())
val firstDigitIndex: Int = formatTokens.indexWhere {
case _: DigitGroups => true
Expand All @@ -294,22 +281,37 @@ class ToNumberParser(numberFormat: String, errorOnFail: Boolean) extends Seriali

// Make sure the format string contains at least one token.
if (numberFormat.isEmpty) {
return "The format string cannot be empty"
return DataTypeMismatch(
errorSubClass = "FORMAT_EMPTY",
messageParameters = Map.empty
)
}
// Make sure the format string contains at least one digit.
if (!formatTokens.exists(
token => token.isInstanceOf[DigitGroups])) {
return "The format string requires at least one number digit"
return DataTypeMismatch(
errorSubClass = "FORMAT_WRONG_NUM_DIGIT",
messageParameters = Map.empty
)
}
// Make sure that any dollar sign in the format string occurs before any digits.
if (firstDigitIndex < firstDollarSignIndex) {
return s"Currency characters must appear before digits in the number format: '$numberFormat'"
return DataTypeMismatch(
errorSubClass = "FORMAT_CUR_MUST_BEFORE_DIGIT",
messageParameters = Map(
"format" -> toSQLValue(numberFormat, StringType)
)
)
}
// Make sure that any dollar sign in the format string occurs before any decimal point.
if (firstDecimalPointIndex != -1 &&
firstDecimalPointIndex < firstDollarSignIndex) {
return "Currency characters must appear before any decimal point in the " +
s"number format: '$numberFormat'"
return DataTypeMismatch(
errorSubClass = "FORMAT_CUR_MUST_BEFORE_DEC",
messageParameters = Map(
"format" -> toSQLValue(numberFormat, StringType)
)
)
}
// Make sure that any thousands separators in the format string have digits before and after.
if (digitGroupsBeforeDecimalPoint.exists {
Expand All @@ -325,16 +327,24 @@ class ToNumberParser(numberFormat: String, errorOnFail: Boolean) extends Seriali
false
})
}) {
return "Thousands separators (, or G) must have digits in between them " +
s"in the number format: '$numberFormat'"
return DataTypeMismatch(
errorSubClass = "FORMAT_CONT_THOUSANDS_SEPS",
messageParameters = Map(
"format" -> toSQLValue(numberFormat, StringType)
)
)
}
// Make sure that thousands separators does not appear after the decimal point, if any.
if (digitGroupsAfterDecimalPoint.exists {
case DigitGroups(tokens, digits) =>
tokens.length > digits.length
}) {
return "Thousands separators (, or G) may not appear after the decimal point " +
s"in the number format: '$numberFormat'"
return DataTypeMismatch(
errorSubClass = "FORMAT_THOUSANDS_SEPS_MUST_BEFORE_DEC",
messageParameters = Map(
"format" -> toSQLValue(numberFormat, StringType)
)
)
}
// Make sure that the format string does not contain any prohibited duplicate tokens.
val inputTokenCounts = formatTokens.groupBy(identity).mapValues(_.size)
Expand All @@ -344,7 +354,13 @@ class ToNumberParser(numberFormat: String, errorOnFail: Boolean) extends Seriali
DollarSign(),
ClosingAngleBracket()).foreach {
token => if (inputTokenCounts.getOrElse(token, 0) > 1) {
return s"At most one ${token.toString} is allowed in the number format: '$numberFormat'"
return DataTypeMismatch(
errorSubClass = "FORMAT_WRONG_NUM_TOKEN",
messageParameters = Map(
"token" -> token.toString,
"format" -> toSQLValue(numberFormat, StringType)
)
)
}
}
// Enforce the ordering of tokens in the format string according to this specification:
Expand Down Expand Up @@ -377,12 +393,16 @@ class ToNumberParser(numberFormat: String, errorOnFail: Boolean) extends Seriali
}
}
if (formatTokenIndex < formatTokens.length) {
return s"Unexpected ${formatTokens(formatTokenIndex).toString} found in the format string " +
s"'$numberFormat'; the structure of the format string must match: " +
"[MI|S] [$] [0|9|G|,]* [.|D] [0|9]* [$] [PR|MI|S]"
return DataTypeMismatch(
errorSubClass = "FORMAT_UNEXPECTED_TOKEN",
messageParameters = Map(
"token" -> formatTokens(formatTokenIndex).toString,
"format" -> toSQLValue(numberFormat, StringType)
)
)
}
// Validation of the format string finished successfully.
""
TypeCheckSuccess
}

/**
Expand Down
Loading

0 comments on commit b8b90ad

Please sign in to comment.