[SPARK-16730][SQL] Implement function aliases for type casts
## What changes were proposed in this pull request?
Spark 1.x supports using Hive type names as function names to perform casts, e.g.
```sql
SELECT int(1.0);
SELECT string(2.0);
```

The queries above work in Spark 1.x because Spark 1.x falls back to Hive for unimplemented functions, but they break in Spark 2.0 because that fallback was removed.

This patch implements aliases for the following cast functions by registering them in the function registry (see the SQL sketch after the list):
- boolean
- tinyint
- smallint
- int
- bigint
- float
- double
- decimal
- date
- timestamp
- binary
- string
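
Each alias resolves to an ordinary cast of its single argument. A minimal SQL sketch (results shown as comments; `decimal` uses Spark's user-default precision and scale, DECIMAL(10,0)):
```sql
SELECT boolean(1);                     -- true, same as CAST(1 AS BOOLEAN)
SELECT int(1.0);                       -- 1,    same as CAST(1.0 AS INT)
SELECT decimal(1);                     -- 1,    same as CAST(1 AS DECIMAL(10,0))
SELECT timestamp(date('2014-04-04'));  -- 2014-04-04 00:00:00
```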

## How was this patch tested?
Added end-to-end tests in SQLCompatibilityFunctionSuite.

Author: petermaxlee <petermaxlee@gmail.com>

Closes #14364 from petermaxlee/SPARK-16730-2.
petermaxlee authored and cloud-fan committed Jul 28, 2016
1 parent b14d7b5 commit 11d427c
Showing 3 changed files with 73 additions and 7 deletions.
FunctionRegistry.scala:
```diff
@@ -27,6 +27,7 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.aggregate._
 import org.apache.spark.sql.catalyst.expressions.xml._
 import org.apache.spark.sql.catalyst.util.StringKeyHashMap
+import org.apache.spark.sql.types._
 
 
 /**
@@ -408,8 +409,21 @@ object FunctionRegistry {
     expression[BitwiseAnd]("&"),
     expression[BitwiseNot]("~"),
     expression[BitwiseOr]("|"),
-    expression[BitwiseXor]("^")
-
+    expression[BitwiseXor]("^"),
+
+    // Cast aliases (SPARK-16730)
+    castAlias("boolean", BooleanType),
+    castAlias("tinyint", ByteType),
+    castAlias("smallint", ShortType),
+    castAlias("int", IntegerType),
+    castAlias("bigint", LongType),
+    castAlias("float", FloatType),
+    castAlias("double", DoubleType),
+    castAlias("decimal", DecimalType.USER_DEFAULT),
+    castAlias("date", DateType),
+    castAlias("timestamp", TimestampType),
+    castAlias("binary", BinaryType),
+    castAlias("string", StringType)
   )
 
   val builtin: SimpleFunctionRegistry = {
@@ -452,14 +466,37 @@ object FunctionRegistry {
       }
     }
 
-    val clazz = tag.runtimeClass
+    (name, (expressionInfo[T](name), builder))
+  }
+
+  /**
+   * Creates a function registry lookup entry for cast aliases (SPARK-16730).
+   * For example, if name is "int", and dataType is IntegerType, this means int(x) would become
+   * an alias for cast(x as IntegerType).
+   * See usage above.
+   */
+  private def castAlias(
+      name: String,
+      dataType: DataType): (String, (ExpressionInfo, FunctionBuilder)) = {
+    val builder = (args: Seq[Expression]) => {
+      if (args.size != 1) {
+        throw new AnalysisException(s"Function $name accepts only one argument")
+      }
+      Cast(args.head, dataType)
+    }
+    (name, (expressionInfo[Cast](name), builder))
+  }
+
+  /**
+   * Creates an [[ExpressionInfo]] for the function as defined by expression T using the given name.
+   */
+  private def expressionInfo[T <: Expression : ClassTag](name: String): ExpressionInfo = {
+    val clazz = scala.reflect.classTag[T].runtimeClass
     val df = clazz.getAnnotation(classOf[ExpressionDescription])
     if (df != null) {
-      (name,
-        (new ExpressionInfo(clazz.getCanonicalName, name, df.usage(), df.extended()),
-        builder))
+      new ExpressionInfo(clazz.getCanonicalName, name, df.usage(), df.extended())
     } else {
-      (name, (new ExpressionInfo(clazz.getCanonicalName, name), builder))
+      new ExpressionInfo(clazz.getCanonicalName, name)
     }
   }
 }
```
Cast.scala:
```diff
@@ -113,6 +113,9 @@ object Cast {
 }
 
 /** Cast the child expression to the target data type. */
+@ExpressionDescription(
+  usage = " - Cast value v to the target data type.",
+  extended = "> SELECT _FUNC_('10' as int);\n 10")
 case class Cast(child: Expression, dataType: DataType) extends UnaryExpression with NullIntolerant {
 
   override def toString: String = s"cast($child as ${dataType.simpleString})"
```
SQLCompatibilityFunctionSuite.scala:
```diff
@@ -17,10 +17,14 @@
 
 package org.apache.spark.sql
 
+import java.math.BigDecimal
+import java.sql.Timestamp
+
 import org.apache.spark.sql.test.SharedSQLContext
 
 /**
  * A test suite for functions added for compatibility with other databases such as Oracle, MSSQL.
+ *
  * These functions are typically implemented using the trait
  * [[org.apache.spark.sql.catalyst.expressions.RuntimeReplaceable]].
  */
@@ -69,4 +73,26 @@ class SQLCompatibilityFunctionSuite extends QueryTest with SharedSQLContext {
       sql("SELECT nvl2(null, 1, 2.1d), nvl2('n', 1, 2.1d)"),
       Row(2.1, 1.0))
   }
+
+  test("SPARK-16730 cast alias functions for Hive compatibility") {
+    checkAnswer(
+      sql("SELECT boolean(1), tinyint(1), smallint(1), int(1), bigint(1)"),
+      Row(true, 1.toByte, 1.toShort, 1, 1L))
+
+    checkAnswer(
+      sql("SELECT float(1), double(1), decimal(1)"),
+      Row(1.toFloat, 1.0, new BigDecimal(1)))
+
+    checkAnswer(
+      sql("SELECT date(\"2014-04-04\"), timestamp(date(\"2014-04-04\"))"),
+      Row(new java.util.Date(114, 3, 4), new Timestamp(114, 3, 4, 0, 0, 0, 0)))
+
+    checkAnswer(
+      sql("SELECT string(1)"),
+      Row("1"))
+
+    // Error handling: only one argument
+    val errorMsg = intercept[AnalysisException](sql("SELECT string(1, 2)")).getMessage
+    assert(errorMsg.contains("Function string accepts only one argument"))
+  }
 }
```
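
A quick way to confirm the rewriting, sketched here rather than taken from the patch: since the registered builder returns a `Cast` expression directly, `EXPLAIN EXTENDED` on an aliased call should show a plain `cast` in the analyzed plan.
```sql
EXPLAIN EXTENDED SELECT int(1.0);
-- The analyzed plan should contain cast(1.0 as int),
-- not a call to a distinct "int" function.
```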
