From 2cdbe7214ce1b04f7639b814362c81765dcd63ff Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 11 Jun 2020 06:39:14 +0000 Subject: [PATCH] [SPARK-31958][SQL] normalize special floating numbers in subquery ### What changes were proposed in this pull request? This is a followup of https://github.com/apache/spark/pull/23388 . https://github.com/apache/spark/pull/23388 has an issue: it doesn't handle subquery expressions and assumes they will be turned into joins. However, this is not true for non-correlated subquery expressions. This PR fixes this issue. It now doesn't skip `Subquery`, and subquery expressions will be handled by `OptimizeSubqueries`, which runs the optimizer with the subquery. Note that, correlated subquery expressions will be handled twice: once in `OptimizeSubqueries`, once later when it becomes a join. This is OK as `NormalizeFloatingNumbers` is idempotent now. ### Why are the changes needed? fix a bug ### Does this PR introduce _any_ user-facing change? yes, see the newly added test. ### How was this patch tested? new test Closes #28785 from cloud-fan/normalize. 
Authored-by: Wenchen Fan Signed-off-by: Wenchen Fan (cherry picked from commit 6fb9c80da129d0b43f9ff5b8be6ce8bad992a4ed) Signed-off-by: Wenchen Fan --- .../optimizer/NormalizeFloatingNumbers.scala | 4 ---- .../org/apache/spark/sql/SQLQuerySuite.scala | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeFloatingNumbers.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeFloatingNumbers.scala index 5f94af5ffe636..43738204c6704 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeFloatingNumbers.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeFloatingNumbers.scala @@ -56,10 +56,6 @@ import org.apache.spark.sql.types._ object NormalizeFloatingNumbers extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan match { - // A subquery will be rewritten into join later, and will go through this rule - // eventually. Here we skip subquery, as we only need to run this rule once. 
- case _: Subquery => plan - case _ => plan transform { case w: Window if w.partitionSpec.exists(p => needNormalize(p)) => // Although the `windowExpressions` may refer to `partitionSpec` expressions, we don't need diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index a23e5831f5887..093f2dbd1e426 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -3449,6 +3449,24 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark checkAnswer(sql("select CAST(-32768 as short) DIV CAST (-1 as short)"), Seq(Row(Short.MinValue.toLong * -1))) } + + test("normalize special floating numbers in subquery") { + withTempView("v1", "v2", "v3") { + Seq(-0.0).toDF("d").createTempView("v1") + Seq(0.0).toDF("d").createTempView("v2") + spark.range(2).createTempView("v3") + + // non-correlated subquery + checkAnswer(sql("SELECT (SELECT v1.d FROM v1 JOIN v2 ON v1.d = v2.d)"), Row(-0.0)) + // correlated subquery + checkAnswer( + sql( + """ + |SELECT id FROM v3 WHERE EXISTS + | (SELECT v1.d FROM v1 JOIN v2 ON v1.d = v2.d WHERE id > 0) + |""".stripMargin), Row(1)) + } + } } case class Foo(bar: Option[String])