-
Notifications
You must be signed in to change notification settings - Fork 28.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-27893][SQL][PYTHON][FOLLOW-UP] Allow Scalar Pandas and Python UDFs to be tested with Scala test base #24945
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -40,32 +40,37 @@ import org.apache.spark.sql.types.StringType | |
* | ||
* To register Scala UDF in SQL: | ||
* {{{ | ||
* registerTestUDF(TestScalaUDF(name = "udf_name"), spark) | ||
* val scalaTestUDF = TestScalaUDF(name = "udf_name") | ||
* registerTestUDF(scalaTestUDF, spark) | ||
* }}} | ||
* | ||
* To register Python UDF in SQL: | ||
* {{{ | ||
* registerTestUDF(TestPythonUDF(name = "udf_name"), spark) | ||
* val pythonTestUDF = TestPythonUDF(name = "udf_name") | ||
* registerTestUDF(pythonTestUDF, spark) | ||
* }}} | ||
* | ||
* To register Scalar Pandas UDF in SQL: | ||
* {{{ | ||
* registerTestUDF(TestScalarPandasUDF(name = "udf_name"), spark) | ||
* val pandasTestUDF = TestScalarPandasUDF(name = "udf_name") | ||
* registerTestUDF(pandasTestUDF, spark) | ||
* }}} | ||
* | ||
* To use it in Scala API and SQL: | ||
* {{{ | ||
* sql("SELECT udf_name(1)") | ||
* spark.select(expr("udf_name(1)") | ||
* spark.range(10).select(expr("udf_name(id)")) | ||
* spark.range(10).select(pandasTestUDF($"id")) | ||
* }}} | ||
*/ | ||
object IntegratedUDFTestUtils extends SQLHelper { | ||
import scala.sys.process._ | ||
|
||
private lazy val pythonPath = sys.env.getOrElse("PYTHONPATH", "") | ||
private lazy val sparkHome = if (sys.props.contains(Tests.IS_TESTING.key)) { | ||
assert(sys.props.contains("spark.test.home"), "spark.test.home is not set.") | ||
sys.props("spark.test.home") | ||
assert(sys.props.contains("spark.test.home") || | ||
sys.env.contains("SPARK_HOME"), "spark.test.home or SPARK_HOME is not set.") | ||
sys.props.getOrElse("spark.test.home", sys.env("SPARK_HOME")) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is for the IDE case. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should we add a code comment explaining this reason? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oops, I missed this. Actually, there are multiple places like this. Let me fix them together later in a separate change. |
||
} else { | ||
assert(sys.env.contains("SPARK_HOME"), "SPARK_HOME is not set.") | ||
sys.env("SPARK_HOME") | ||
|
@@ -186,14 +191,18 @@ object IntegratedUDFTestUtils extends SQLHelper { | |
/** | ||
* A base trait for various UDFs defined in this object. | ||
*/ | ||
sealed trait TestUDF | ||
sealed trait TestUDF { | ||
def apply(exprs: Column*): Column | ||
|
||
val prettyName: String | ||
} | ||
|
||
/** | ||
* A Python UDF that takes one column and returns a string column. | ||
* Equivalent to `udf(lambda x: str(x), "string")`. | ||
*/ | ||
case class TestPythonUDF(name: String) extends TestUDF { | ||
lazy val udf = UserDefinedPythonFunction( | ||
private[IntegratedUDFTestUtils] lazy val udf = UserDefinedPythonFunction( | ||
name = name, | ||
func = PythonFunction( | ||
command = pythonFunc, | ||
|
@@ -206,14 +215,18 @@ object IntegratedUDFTestUtils extends SQLHelper { | |
dataType = StringType, | ||
pythonEvalType = PythonEvalType.SQL_BATCHED_UDF, | ||
udfDeterministic = true) | ||
|
||
def apply(exprs: Column*): Column = udf(exprs: _*) | ||
|
||
val prettyName: String = "Regular Python UDF" | ||
} | ||
|
||
/** | ||
* A Scalar Pandas UDF that takes one column and returns a string column. | ||
* Equivalent to `pandas_udf(lambda x: x.apply(str), "string", PandasUDFType.SCALAR)`. | ||
*/ | ||
case class TestScalarPandasUDF(name: String) extends TestUDF { | ||
lazy val udf = UserDefinedPythonFunction( | ||
private[IntegratedUDFTestUtils] lazy val udf = UserDefinedPythonFunction( | ||
name = name, | ||
func = PythonFunction( | ||
command = pandasFunc, | ||
|
@@ -226,17 +239,25 @@ object IntegratedUDFTestUtils extends SQLHelper { | |
dataType = StringType, | ||
pythonEvalType = PythonEvalType.SQL_SCALAR_PANDAS_UDF, | ||
udfDeterministic = true) | ||
|
||
def apply(exprs: Column*): Column = udf(exprs: _*) | ||
|
||
val prettyName: String = "Scalar Pandas UDF" | ||
} | ||
|
||
/** | ||
* A Scala UDF that takes one column and returns a string column. | ||
* Equivalent to `udf((input: Any) => input.toString)`. | ||
*/ | ||
case class TestScalaUDF(name: String) extends TestUDF { | ||
lazy val udf = SparkUserDefinedFunction( | ||
private[IntegratedUDFTestUtils] lazy val udf = SparkUserDefinedFunction( | ||
(input: Any) => input.toString, | ||
StringType, | ||
inputSchemas = Seq.fill(1)(None)) | ||
|
||
def apply(exprs: Column*): Column = udf(exprs: _*) | ||
|
||
val prettyName: String = "Scala UDF" | ||
} | ||
|
||
/** | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will we use it? In `SQLQueryTestSuite`, I think UDFs are all registered for `UDFTestCase`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, this one will be used in #24946.