Skip to content

Commit

Permalink
[SPARK-25708][SQL] HAVING without GROUP BY means global aggregate
Browse files Browse the repository at this point in the history
## What changes were proposed in this pull request?

According to the SQL standard, when a query contains `HAVING`, it indicates an aggregate operator. For more details please refer to https://blog.jooq.org/2014/12/04/do-you-really-understand-sqls-group-by-and-having-clauses/

However, in Spark SQL parser, we treat HAVING as a normal filter when there is no GROUP BY, which breaks SQL semantic and lead to wrong result. This PR fixes the parser.

## How was this patch tested?

new test

Closes #22696 from cloud-fan/having.

Authored-by: Wenchen Fan <wenchen@databricks.com>
Signed-off-by: gatorsmile <gatorsmile@gmail.com>
  • Loading branch information
cloud-fan authored and gatorsmile committed Oct 12, 2018
1 parent 3685130 commit 78e1331
Show file tree
Hide file tree
Showing 7 changed files with 71 additions and 19 deletions.
1 change: 1 addition & 0 deletions docs/sql-programming-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -1977,6 +1977,7 @@ working with timestamps in `pandas_udf`s to get the best performance, see
- Since Spark 2.4, Metadata files (e.g. Parquet summary files) and temporary files are not counted as data files when calculating table size during Statistics computation.
- Since Spark 2.4, empty strings are saved as quoted empty strings `""`. In version 2.3 and earlier, empty strings are equal to `null` values and do not reflect to any characters in saved CSV files. For example, the row of `"a", null, "", 1` was writted as `a,,,1`. Since Spark 2.4, the same row is saved as `a,,"",1`. To restore the previous behavior, set the CSV option `emptyValue` to empty (not quoted) string.
- Since Spark 2.4, The LOAD DATA command supports wildcard `?` and `*`, which match any one character, and zero or more characters, respectively. Example: `LOAD DATA INPATH '/tmp/folder*/'` or `LOAD DATA INPATH '/tmp/part-?'`. Special Characters like `space` also now work in paths. Example: `LOAD DATA INPATH '/tmp/folder name/'`.
- In Spark version 2.3 and earlier, HAVING without GROUP BY is treated as WHERE. This means, `SELECT 1 FROM range(10) HAVING true` is executed as `SELECT 1 FROM range(10) WHERE true` and returns 10 rows. This violates SQL standard, and has been fixed in Spark 2.4. Since Spark 2.4, HAVING without GROUP BY is treated as a global aggregate, which means `SELECT 1 FROM range(10) HAVING true` will return only one row. To restore the previous behavior, set `spark.sql.legacy.parser.havingWithoutGroupByAsWhere` to `true`.

## Upgrading From Spark SQL 2.3.0 to 2.3.1 and above

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,17 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging
Filter(expression(ctx), plan)
}

def withHaving(ctx: BooleanExpressionContext, plan: LogicalPlan): LogicalPlan = {
// Note that we add a cast to non-predicate expressions. If the expression itself is
// already boolean, the optimizer will get rid of the unnecessary cast.
val predicate = expression(ctx) match {
case p: Predicate => p
case e => Cast(e, BooleanType)
}
Filter(predicate, plan)
}


// Expressions.
val expressions = Option(namedExpressionSeq).toSeq
.flatMap(_.namedExpression.asScala)
Expand Down Expand Up @@ -446,30 +457,34 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging
case e: NamedExpression => e
case e: Expression => UnresolvedAlias(e)
}
val withProject = if (aggregation != null) {
withAggregation(aggregation, namedExpressions, withFilter)
} else if (namedExpressions.nonEmpty) {

def createProject() = if (namedExpressions.nonEmpty) {
Project(namedExpressions, withFilter)
} else {
withFilter
}

// Having
val withHaving = withProject.optional(having) {
// Note that we add a cast to non-predicate expressions. If the expression itself is
// already boolean, the optimizer will get rid of the unnecessary cast.
val predicate = expression(having) match {
case p: Predicate => p
case e => Cast(e, BooleanType)
val withProject = if (aggregation == null && having != null) {
if (conf.getConf(SQLConf.LEGACY_HAVING_WITHOUT_GROUP_BY_AS_WHERE)) {
// If the legacy conf is set, treat HAVING without GROUP BY as WHERE.
withHaving(having, createProject())
} else {
// According to SQL standard, HAVING without GROUP BY means global aggregate.
withHaving(having, Aggregate(Nil, namedExpressions, withFilter))
}
Filter(predicate, withProject)
} else if (aggregation != null) {
val aggregate = withAggregation(aggregation, namedExpressions, withFilter)
aggregate.optionalMap(having)(withHaving)
} else {
// When hitting this branch, `having` must be null.
createProject()
}

// Distinct
val withDistinct = if (setQuantifier() != null && setQuantifier().DISTINCT() != null) {
Distinct(withHaving)
Distinct(withProject)
} else {
withHaving
withProject
}

// Window
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1567,6 +1567,14 @@ object SQLConf {
.internal()
.booleanConf
.createWithDefault(false)

val LEGACY_HAVING_WITHOUT_GROUP_BY_AS_WHERE =
buildConf("spark.sql.legacy.parser.havingWithoutGroupByAsWhere")
.internal()
.doc("If it is set to true, the parser will treat HAVING without GROUP BY as a normal " +
"WHERE, which does not follow SQL standard.")
.booleanConf
.createWithDefault(false)
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ class PlanParserSuite extends AnalysisTest {
assertEqual("select a, b from db.c where x < 1", table("db", "c").where('x < 1).select('a, 'b))
assertEqual(
"select a, b from db.c having x < 1",
table("db", "c").select('a, 'b).where('x < 1))
table("db", "c").groupBy()('a, 'b).where('x < 1))
assertEqual("select distinct a, b from db.c", Distinct(table("db", "c").select('a, 'b)))
assertEqual("select all a, b from db.c", table("db", "c").select('a, 'b))
assertEqual("select from tbl", OneRowRelation().select('from.as("tbl")))
Expand Down
7 changes: 7 additions & 0 deletions sql/core/src/test/resources/sql-tests/inputs/group-by.sql
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,10 @@ where b.z != b.z;
-- SPARK-24369 multiple distinct aggregations having the same argument set
SELECT corr(DISTINCT x, y), corr(DISTINCT y, x), count(*)
FROM (VALUES (1, 1), (2, 2), (2, 2)) t(x, y);

-- SPARK-25708 HAVING without GROUP BY means global aggregate
SELECT 1 FROM range(10) HAVING true;

SELECT 1 FROM range(10) HAVING MAX(id) > 0;

SELECT id FROM range(10) HAVING id > 0;
27 changes: 26 additions & 1 deletion sql/core/src/test/resources/sql-tests/results/group-by.sql.out
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 27
-- Number of queries: 30


-- !query 0
Expand Down Expand Up @@ -250,3 +250,28 @@ SELECT corr(DISTINCT x, y), corr(DISTINCT y, x), count(*)
struct<corr(DISTINCT CAST(x AS DOUBLE), CAST(y AS DOUBLE)):double,corr(DISTINCT CAST(y AS DOUBLE), CAST(x AS DOUBLE)):double,count(1):bigint>
-- !query 26 output
1.0 1.0 3


-- !query 27
SELECT 1 FROM range(10) HAVING true
-- !query 27 schema
struct<1:int>
-- !query 27 output
1


-- !query 28
SELECT 1 FROM range(10) HAVING MAX(id) > 0
-- !query 28 schema
struct<1:int>
-- !query 28 output
1


-- !query 29
SELECT id FROM range(10) HAVING id > 0
-- !query 29 schema
struct<>
-- !query 29 output
org.apache.spark.sql.AnalysisException
grouping expressions sequence is empty, and '`id`' is not an aggregate function. Wrap '()' in windowing function(s) or wrap '`id`' in first() (or first_value) if you don't care which value you get.;
Original file line number Diff line number Diff line change
Expand Up @@ -740,10 +740,6 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd
sql("select key, count(*) c from src group by key having c").collect()
}

test("SPARK-2225: turn HAVING without GROUP BY into a simple filter") {
assert(sql("select key from src having key > 490").collect().size < 100)
}

test("union/except/intersect") {
assertResult(Array(Row(1), Row(1))) {
sql("select 1 as a union all select 1 as a").collect()
Expand Down

0 comments on commit 78e1331

Please sign in to comment.