diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 01bd3edd290c6..3fd5039a4f116 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -1814,9 +1814,15 @@ class Analyzer( } // We get an aggregate function, we need to wrap it in an AggregateExpression. case agg: AggregateFunction => - if (filter.isDefined && !filter.get.deterministic) { - failAnalysis("FILTER expression is non-deterministic, " + - "it cannot be used in aggregate functions") + // TODO: SPARK-30276 Support Filter expression allows simultaneous use of DISTINCT + if (filter.isDefined) { + if (isDistinct) { + failAnalysis("DISTINCT and FILTER cannot be used in aggregate functions " + + "at the same time") + } else if (!filter.get.deterministic) { + failAnalysis("FILTER expression is non-deterministic, " + + "it cannot be used in aggregate functions") + } } AggregateExpression(agg, Complete, isDistinct, filter) // This function is not an aggregate function, just return the resolved one. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala index dc05bb363e356..e5571069a7c41 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteDistinctAggregates.scala @@ -31,10 +31,10 @@ import org.apache.spark.sql.types.IntegerType * First example: query without filter clauses (in scala): * {{{ * val data = Seq( - * (1, "a", "ca1", "cb1", 10), - * (2, "a", "ca1", "cb2", 5), - * (3, "b", "ca1", "cb1", 13)) - * .toDF("id", "key", "cat1", "cat2", "value") + * ("a", "ca1", "cb1", 10), + * ("a", "ca1", "cb2", 5), + * ("b", "ca1", "cb1", 13)) + * .toDF("key", "cat1", "cat2", "value") * data.createOrReplaceTempView("data") * * val agg = data.groupBy($"key") @@ -118,66 +118,7 @@ import org.apache.spark.sql.types.IntegerType * LocalTableScan [...] * }}} * - * Third example: single distinct aggregate function with filter clauses (in sql): - * {{{ - * SELECT - * COUNT(DISTINCT cat1) FILTER (WHERE id > 1) as cat1_cnt1, - * COUNT(DISTINCT cat1) as cat1_cnt2, - * SUM(value) AS total - * FROM - * data - * GROUP BY - * key - * }}} - * - * This translates to the following (pseudo) logical plan: - * {{{ - * Aggregate( - * key = ['key] - * functions = [COUNT(DISTINCT 'cat1) with FILTER('id > 1), - * COUNT(DISTINCT 'cat1), - * sum('value)] - * output = ['key, 'cat1_cnt1, 'cat1_cnt2, 'total]) - * LocalTableScan [...] - * }}} - * - * This rule rewrites this logical plan to the following (pseudo) logical plan: - * {{{ - * Aggregate( - * key = ['key] - * functions = [count(if (('gid = 1)) '_gen_distinct_1 else null), - * count(if (('gid = 2)) '_gen_distinct_2 else null), - * first(if (('gid = 0)) 'total else null) ignore nulls] - * output = ['key, 'cat1_cnt1, 'cat1_cnt2, 'total]) - * Aggregate( - * key = ['key, '_gen_distinct_1, '_gen_distinct_2, 'gid] - * functions = [sum('value)] - * output = ['key, '_gen_distinct_1, '_gen_distinct_2, 'gid, 'total]) - * Expand( - * projections = [('key, null, null, 0, 'value), - * ('key, '_gen_distinct_1, null, 1, null), - * ('key, null, '_gen_distinct_2, 2, null)] - * output = ['key, '_gen_distinct_1, '_gen_distinct_2, 'gid, 'value]) - * Expand( - * projections = [('key, if ('id > 1) 'cat1 else null, 'cat1, cast('value as bigint))] - * output = ['key, '_gen_distinct_1, '_gen_distinct_2, 'value]) - * LocalTableScan [...] - * }}} - * - * The rule serves two purposes: - * 1. Expand distinct aggregates which exists filter clause. - * 2. Rewrite when aggregate exists at least two distinct aggregates. - * - * The first child rule does the following things here: - * 1. Guaranteed to compute filter clause locally. - * 2. The attributes referenced by different distinct aggregate expressions are likely to overlap, - * and if no additional processing is performed, data loss will occur. To prevent this, we - * generate new attributes and replace the original ones. - * 3. If we apply the first rule to distinct aggregate expressions which exists filter - * clause, the aggregate after expand may have at least two distinct aggregates, so we need to - * apply the second rule too. - * - * The second child rule does the following things here: + * The rule does the following things here: * 1. Expand the data. There are three aggregation groups in this query: * i. the non-distinct group; * ii. the distinct 'cat1 group; @@ -207,106 +148,24 @@ object RewriteDistinctAggregates extends Rule[LogicalPlan] { val distinctAggs = exprs.flatMap { _.collect { case ae: AggregateExpression if ae.isDistinct => ae }} - // This rule serves two purposes: - // One is to rewrite when there exists at least two distinct aggregates. We need at least - // two distinct aggregates for this rule because aggregation strategy can handle a single - // distinct group. - // Another is to expand distinct aggregates which exists filter clause so that we can - // evaluate the filter locally. + // We need at least two distinct aggregates for this rule because aggregation + // strategy can handle a single distinct group. // This check can produce false-positives, e.g., SUM(DISTINCT a) & COUNT(DISTINCT a). - distinctAggs.size >= 1 || distinctAggs.exists(_.filter.isDefined) + distinctAggs.size > 1 } def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { - case a: Aggregate if mayNeedtoRewrite(a.aggregateExpressions) => - val expandAggregate = extractFiltersInDistinctAggregate(a) - rewriteDistinctAggregate(expandAggregate) + case a: Aggregate if mayNeedtoRewrite(a.aggregateExpressions) => rewrite(a) } - private def extractFiltersInDistinctAggregate(a: Aggregate): Aggregate = { - val aggExpressions = collectAggregateExprs(a) - val (distinctAggExpressions, regularAggExpressions) = aggExpressions.partition(_.isDistinct) - if (distinctAggExpressions.exists(_.filter.isDefined)) { - // Setup expand for the 'regular' aggregate expressions. Because we will construct a new - // aggregate, the children of the distinct aggregates will be changed to the generate - // ones, so we need creates new references to avoid collisions between distinct and - // regular aggregate children. - val regularAggExprs = regularAggExpressions.filter(_.children.exists(!_.foldable)) - val regularFunChildren = regularAggExprs - .flatMap(_.aggregateFunction.children.filter(!_.foldable)) - val regularFilterAttrs = regularAggExprs.flatMap(_.filterAttributes) - val regularAggChildren = (regularFunChildren ++ regularFilterAttrs).distinct - val regularAggChildAttrMap = regularAggChildren.map(expressionAttributePair) - val regularAggChildAttrLookup = regularAggChildAttrMap.toMap - val regularAggMap = regularAggExprs.map { - case ae @ AggregateExpression(af, _, _, filter, _) => - val newChildren = af.children.map(c => regularAggChildAttrLookup.getOrElse(c, c)) - val raf = af.withNewChildren(newChildren).asInstanceOf[AggregateFunction] - val filterOpt = filter.map(_.transform { - case a: Attribute => regularAggChildAttrLookup.getOrElse(a, a) - }) - val aggExpr = ae.copy(aggregateFunction = raf, filter = filterOpt) - (ae, aggExpr) - } + def rewrite(a: Aggregate): Aggregate = { - // Setup expand for the 'distinct' aggregate expressions. - val distinctAggExprs = distinctAggExpressions.filter(e => e.children.exists(!_.foldable)) - val (projections, expressionAttrs, aggExprPairs) = distinctAggExprs.map { - case ae @ AggregateExpression(af, _, _, filter, _) => - // Why do we need to construct the `exprId` ? - // First, In order to reduce costs, it is better to handle the filter clause locally. - // e.g. COUNT (DISTINCT a) FILTER (WHERE id > 1), evaluate expression - // If(id > 1) 'a else null first, and use the result as output. - // Second, If at least two DISTINCT aggregate expression which may references the - // same attributes. We need to construct the generated attributes so as the output not - // lost. e.g. SUM (DISTINCT a), COUNT (DISTINCT a) FILTER (WHERE id > 1) will output - // attribute '_gen_distinct-1 and attribute '_gen_distinct-2 instead of two 'a. - // Note: We just need to illusion the expression with filter clause. - // The illusionary mechanism may result in multiple distinct aggregations uses - // different column, so we still need to call `rewrite`. - val exprId = NamedExpression.newExprId.id - val unfoldableChildren = af.children.filter(!_.foldable) - val exprAttrs = unfoldableChildren.map { e => - (e, AttributeReference(s"_gen_distinct_$exprId", e.dataType, nullable = true)()) - } - val exprAttrLookup = exprAttrs.toMap - val newChildren = af.children.map(c => exprAttrLookup.getOrElse(c, c)) - val raf = af.withNewChildren(newChildren).asInstanceOf[AggregateFunction] - val aggExpr = ae.copy(aggregateFunction = raf, filter = None) - // Expand projection - val projection = unfoldableChildren.map { - case e if filter.isDefined => If(filter.get, e, nullify(e)) - case e => e - } - (projection, exprAttrs, (ae, aggExpr)) - }.unzip3 - val distinctAggChildAttrs = expressionAttrs.flatten.map(_._2) - val allAggAttrs = regularAggChildAttrMap.map(_._2) ++ distinctAggChildAttrs - // Construct the aggregate input projection. - val rewriteAggProjections = - Seq(a.groupingExpressions ++ regularAggChildren ++ projections.flatten) - val groupByMap = a.groupingExpressions.collect { - case ne: NamedExpression => ne -> ne.toAttribute - case e => e -> AttributeReference(e.sql, e.dataType, e.nullable)() - } - val groupByAttrs = groupByMap.map(_._2) - // Construct the expand operator. - val expand = Expand(rewriteAggProjections, groupByAttrs ++ allAggAttrs, a.child) - val rewriteAggExprLookup = (aggExprPairs ++ regularAggMap).toMap - val patchedAggExpressions = a.aggregateExpressions.map { e => - e.transformDown { - case ae: AggregateExpression => rewriteAggExprLookup.getOrElse(ae, ae) - }.asInstanceOf[NamedExpression] + // Collect all aggregate expressions. + val aggExpressions = a.aggregateExpressions.flatMap { e => + e.collect { + case ae: AggregateExpression => ae } - val expandAggregate = Aggregate(groupByAttrs, patchedAggExpressions, expand) - expandAggregate - } else { - a } - } - - private def rewriteDistinctAggregate(a: Aggregate): Aggregate = { - val aggExpressions = collectAggregateExprs(a) // Extract distinct aggregate expressions. val distinctAggGroups = aggExpressions.filter(_.isDistinct).groupBy { e => @@ -472,14 +331,6 @@ object RewriteDistinctAggregates extends Rule[LogicalPlan] { } } - private def collectAggregateExprs(a: Aggregate): Seq[AggregateExpression] = { - a.aggregateExpressions.flatMap { e => - e.collect { - case ae: AggregateExpression => ae - } - } - } - private def nullify(e: Expression) = Literal.create(null, e.dataType) private def expressionAttributePair(e: Expression) = diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala index d2be6944a2e37..bbd686203f176 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala @@ -190,6 +190,11 @@ class AnalysisErrorSuite extends AnalysisTest { "FILTER (WHERE c > 1)"), "FILTER predicate specified, but aggregate is not an aggregate function" :: Nil) + errorTest( + "DISTINCT and FILTER cannot be used in aggregate functions at the same time", + CatalystSqlParser.parsePlan("SELECT count(DISTINCT a) FILTER (WHERE c > 1) FROM TaBlE2"), + "DISTINCT and FILTER cannot be used in aggregate functions at the same time" :: Nil) + errorTest( "FILTER expression is non-deterministic, it cannot be used in aggregate functions", CatalystSqlParser.parsePlan("SELECT count(a) FILTER (WHERE rand(int(c)) > 1) FROM TaBlE2"), diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by-filter.sql b/sql/core/src/test/resources/sql-tests/inputs/group-by-filter.sql index 14f0eb70657da..beb5b9e5fe516 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/group-by-filter.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/group-by-filter.sql @@ -33,10 +33,8 @@ SELECT COUNT(id) FILTER (WHERE hiredate = date "2001-01-01") FROM emp; SELECT COUNT(id) FILTER (WHERE hiredate = to_date('2001-01-01 00:00:00')) FROM emp; SELECT COUNT(id) FILTER (WHERE hiredate = to_timestamp("2001-01-01 00:00:00")) FROM emp; SELECT COUNT(id) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd") = "2001-01-01") FROM emp; -SELECT COUNT(DISTINCT id) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd HH:mm:ss") = "2001-01-01 00:00:00") FROM emp; -SELECT COUNT(DISTINCT id), COUNT(DISTINCT id) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd HH:mm:ss") = "2001-01-01 00:00:00") FROM emp; -SELECT COUNT(DISTINCT id) FILTER (WHERE hiredate = to_timestamp("2001-01-01 00:00:00")), COUNT(DISTINCT id) FILTER (WHERE hiredate = to_date('2001-01-01 00:00:00')) FROM emp; -SELECT SUM(salary), COUNT(DISTINCT id), COUNT(DISTINCT id) FILTER (WHERE hiredate = date "2001-01-01") FROM emp; +-- [SPARK-30276] Support Filter expression allows simultaneous use of DISTINCT +-- SELECT COUNT(DISTINCT id) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd HH:mm:ss") = "2001-01-01 00:00:00") FROM emp; -- Aggregate with filter and non-empty GroupBy expressions. SELECT a, COUNT(b) FILTER (WHERE a >= 2) FROM testData GROUP BY a; @@ -46,10 +44,8 @@ SELECT dept_id, SUM(salary) FILTER (WHERE hiredate > date "2003-01-01") FROM emp SELECT dept_id, SUM(salary) FILTER (WHERE hiredate > to_date("2003-01-01")) FROM emp GROUP BY dept_id; SELECT dept_id, SUM(salary) FILTER (WHERE hiredate > to_timestamp("2003-01-01 00:00:00")) FROM emp GROUP BY dept_id; SELECT dept_id, SUM(salary) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd") > "2003-01-01") FROM emp GROUP BY dept_id; -SELECT dept_id, SUM(DISTINCT salary) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd HH:mm:ss") > "2001-01-01 00:00:00") FROM emp GROUP BY dept_id; -SELECT dept_id, SUM(DISTINCT salary), SUM(DISTINCT salary) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd HH:mm:ss") > "2001-01-01 00:00:00") FROM emp GROUP BY dept_id; -SELECT dept_id, SUM(DISTINCT salary) FILTER (WHERE hiredate > date "2001-01-01"), SUM(DISTINCT salary) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd HH:mm:ss") > "2001-01-01 00:00:00") FROM emp GROUP BY dept_id; -SELECT dept_id, COUNT(id), SUM(DISTINCT salary), SUM(DISTINCT salary) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd") > "2001-01-01") FROM emp GROUP BY dept_id; +-- [SPARK-30276] Support Filter expression allows simultaneous use of DISTINCT +-- SELECT dept_id, SUM(DISTINCT salary) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd HH:mm:ss") > "2001-01-01 00:00:00") FROM emp GROUP BY dept_id; -- Aggregate with filter and grouped by literals. SELECT 'foo', COUNT(a) FILTER (WHERE b <= 2) FROM testData GROUP BY 1; @@ -62,21 +58,13 @@ select dept_id, count(distinct emp_name), count(distinct hiredate), sum(salary), select dept_id, count(distinct emp_name), count(distinct hiredate), sum(salary), sum(salary) filter (where id + dept_id > 500) from emp group by dept_id; select dept_id, count(distinct emp_name), count(distinct hiredate), sum(salary) filter (where salary < 400.00D), sum(salary) filter (where id > 200) from emp group by dept_id; select dept_id, count(distinct emp_name), count(distinct hiredate), sum(salary) filter (where salary < 400.00D), sum(salary) filter (where id + dept_id > 500) from emp group by dept_id; -select dept_id, count(distinct emp_name) filter (where id > 200), sum(salary) from emp group by dept_id; -select dept_id, count(distinct emp_name) filter (where id + dept_id > 500), sum(salary) from emp group by dept_id; -select dept_id, count(distinct emp_name), count(distinct emp_name) filter (where id > 200), sum(salary) from emp group by dept_id; -select dept_id, count(distinct emp_name), count(distinct emp_name) filter (where id + dept_id > 500), sum(salary) from emp group by dept_id; -select dept_id, count(distinct emp_name), count(distinct emp_name) filter (where id > 200), sum(salary), sum(salary) filter (where id > 200) from emp group by dept_id; -select dept_id, count(distinct emp_name), count(distinct emp_name) filter (where id + dept_id > 500), sum(salary), sum(salary) filter (where id > 200) from emp group by dept_id; -select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct hiredate), sum(salary) from emp group by dept_id; -select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct hiredate) filter (where hiredate > date "2003-01-01"), sum(salary) from emp group by dept_id; -select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct hiredate) filter (where hiredate > date "2003-01-01"), sum(salary) filter (where salary < 400.00D) from emp group by dept_id; -select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct hiredate) filter (where hiredate > date "2003-01-01"), sum(salary) filter (where salary < 400.00D), sum(salary) filter (where id > 200) from emp group by dept_id; -select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct emp_name), sum(salary) from emp group by dept_id; -select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct emp_name) filter (where hiredate > date "2003-01-01"), sum(salary) from emp group by dept_id; -select dept_id, sum(distinct (id + dept_id))) filter (where id > 200), count(distinct hiredate), sum(salary) from emp group by dept_id; -select dept_id, sum(distinct (id + dept_id)) filter (where id > 200), count(distinct hiredate) filter (where hiredate > date "2003-01-01"), sum(salary) from emp group by dept_id; -select dept_id, avg(distinct (id + dept_id)) filter (where id > 200), count(distinct hiredate) filter (where hiredate > date "2003-01-01"), sum(salary) filter (where salary < 400.00D) from emp group by dept_id; +-- [SPARK-30276] Support Filter expression allows simultaneous use of DISTINCT +-- select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct hiredate), sum(salary) from emp group by dept_id; +-- select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct hiredate) filter (where hiredate > date "2003-01-01"), sum(salary) from emp group by dept_id; +-- select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct hiredate) filter (where hiredate > date "2003-01-01"), sum(salary) filter (where salary < 400.00D) from emp group by dept_id; +-- select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct hiredate) filter (where hiredate > date "2003-01-01"), sum(salary) filter (where salary < 400.00D), sum(salary) filter (where id > 200) from emp group by dept_id; +-- select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct emp_name), sum(salary) from emp group by dept_id; +-- select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct emp_name) filter (where hiredate > date "2003-01-01"), sum(salary) from emp group by dept_id; -- Aggregate with filter and grouped by literals (hash aggregate), here the input table is filtered using WHERE. SELECT 'foo', APPROX_COUNT_DISTINCT(a) FILTER (WHERE b >= 0) FROM testData WHERE a = 0 GROUP BY 1; @@ -90,8 +78,9 @@ SELECT a + 2, COUNT(b) FILTER (WHERE b IN (1, 2)) FROM testData GROUP BY a + 1; SELECT a + 1 + 1, COUNT(b) FILTER (WHERE b > 0) FROM testData GROUP BY a + 1; -- Aggregate with filter, foldable input and multiple distinct groups. -SELECT COUNT(DISTINCT b) FILTER (WHERE b > 0), COUNT(DISTINCT b, c) FILTER (WHERE b > 0 AND c > 2) -FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY a; +-- [SPARK-30276] Support Filter expression allows simultaneous use of DISTINCT +-- SELECT COUNT(DISTINCT b) FILTER (WHERE b > 0), COUNT(DISTINCT b, c) FILTER (WHERE b > 0 AND c > 2) +-- FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY a; -- Check analysis exceptions SELECT a AS k, COUNT(b) FILTER (WHERE b > 0) FROM testData GROUP BY k; diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part3.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part3.sql index 657ea59ec8f11..746b677234832 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part3.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part3.sql @@ -241,9 +241,10 @@ select sum(1/ten) filter (where ten > 0) from tenk1; -- select ten, sum(distinct four) filter (where four::text ~ '123') from onek a -- group by ten; -select ten, sum(distinct four) filter (where four > 10) from onek a -group by ten -having exists (select 1 from onek b where sum(distinct a.four) = b.four); +-- [SPARK-30276] Support Filter expression allows simultaneous use of DISTINCT +-- select ten, sum(distinct four) filter (where four > 10) from onek a +-- group by ten +-- having exists (select 1 from onek b where sum(distinct a.four) = b.four); -- [SPARK-28682] ANSI SQL: Collation Support -- select max(foo COLLATE "C") filter (where (bar collate "POSIX") > '0') diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/groupingsets.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/groupingsets.sql index 45617c53166aa..fc54d179f742c 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/groupingsets.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/groupingsets.sql @@ -336,8 +336,9 @@ order by 2,1; -- order by 2,1; -- FILTER queries -select ten, sum(distinct four) filter (where string(four) like '123') from onek a -group by rollup(ten); +-- [SPARK-30276] Support Filter expression allows simultaneous use of DISTINCT +-- select ten, sum(distinct four) filter (where string(four) like '123') from onek a +-- group by rollup(ten); -- More rescan tests -- [SPARK-27877] ANSI SQL: LATERAL derived table(T491) diff --git a/sql/core/src/test/resources/sql-tests/results/group-by-filter.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by-filter.sql.out index 079b651445f62..a032678e90fe8 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-by-filter.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-by-filter.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 61 +-- Number of queries: 37 -- !query @@ -94,38 +94,6 @@ struct --- !query output -2 - - --- !query -SELECT COUNT(DISTINCT id), COUNT(DISTINCT id) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd HH:mm:ss") = "2001-01-01 00:00:00") FROM emp --- !query schema -struct --- !query output -8 2 - - --- !query -SELECT COUNT(DISTINCT id) FILTER (WHERE hiredate = to_timestamp("2001-01-01 00:00:00")), COUNT(DISTINCT id) FILTER (WHERE hiredate = to_date('2001-01-01 00:00:00')) FROM emp --- !query schema -struct --- !query output -2 2 - - --- !query -SELECT SUM(salary), COUNT(DISTINCT id), COUNT(DISTINCT id) FILTER (WHERE hiredate = date "2001-01-01") FROM emp --- !query schema -struct --- !query output -2450.0 8 2 - - -- !query SELECT a, COUNT(b) FILTER (WHERE a >= 2) FROM testData GROUP BY a -- !query schema @@ -209,58 +177,6 @@ struct "2001-01-01 00:00:00") FROM emp GROUP BY dept_id --- !query schema -struct 2001-01-01 00:00:00)):double> --- !query output -10 300.0 -100 400.0 -20 300.0 -30 400.0 -70 150.0 -NULL NULL - - --- !query -SELECT dept_id, SUM(DISTINCT salary), SUM(DISTINCT salary) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd HH:mm:ss") > "2001-01-01 00:00:00") FROM emp GROUP BY dept_id --- !query schema -struct 2001-01-01 00:00:00)):double> --- !query output -10 300.0 300.0 -100 400.0 400.0 -20 300.0 300.0 -30 400.0 400.0 -70 150.0 150.0 -NULL 400.0 NULL - - --- !query -SELECT dept_id, SUM(DISTINCT salary) FILTER (WHERE hiredate > date "2001-01-01"), SUM(DISTINCT salary) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd HH:mm:ss") > "2001-01-01 00:00:00") FROM emp GROUP BY dept_id --- !query schema -struct DATE '2001-01-01')):double,sum(DISTINCT salary) FILTER (WHERE (date_format(CAST(hiredate AS TIMESTAMP), yyyy-MM-dd HH:mm:ss) > 2001-01-01 00:00:00)):double> --- !query output -10 300.0 300.0 -100 400.0 400.0 -20 300.0 300.0 -30 400.0 400.0 -70 150.0 150.0 -NULL NULL NULL - - --- !query -SELECT dept_id, COUNT(id), SUM(DISTINCT salary), SUM(DISTINCT salary) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd") > "2001-01-01") FROM emp GROUP BY dept_id --- !query schema -struct 2001-01-01)):double> --- !query output -10 3 300.0 300.0 -100 2 400.0 400.0 -20 1 300.0 300.0 -30 1 400.0 400.0 -70 1 150.0 150.0 -NULL 1 400.0 NULL - - -- !query SELECT 'foo', COUNT(a) FILTER (WHERE b <= 2) FROM testData GROUP BY 1 -- !query schema @@ -345,202 +261,6 @@ struct 200), sum(salary) from emp group by dept_id --- !query schema -struct 200)):bigint,sum(salary):double> --- !query output -10 0 400.0 -100 2 800.0 -20 1 300.0 -30 1 400.0 -70 1 150.0 -NULL 1 400.0 - - --- !query -select dept_id, count(distinct emp_name) filter (where id + dept_id > 500), sum(salary) from emp group by dept_id --- !query schema -struct 500)):bigint,sum(salary):double> --- !query output -10 0 400.0 -100 2 800.0 -20 0 300.0 -30 0 400.0 -70 1 150.0 -NULL 0 400.0 - - --- !query -select dept_id, count(distinct emp_name), count(distinct emp_name) filter (where id > 200), sum(salary) from emp group by dept_id --- !query schema -struct 200)):bigint,sum(salary):double> --- !query output -10 2 0 400.0 -100 2 2 800.0 -20 1 1 300.0 -30 1 1 400.0 -70 1 1 150.0 -NULL 1 1 400.0 - - --- !query -select dept_id, count(distinct emp_name), count(distinct emp_name) filter (where id + dept_id > 500), sum(salary) from emp group by dept_id --- !query schema -struct 500)):bigint,sum(salary):double> --- !query output -10 2 0 400.0 -100 2 2 800.0 -20 1 0 300.0 -30 1 0 400.0 -70 1 1 150.0 -NULL 1 0 400.0 - - --- !query -select dept_id, count(distinct emp_name), count(distinct emp_name) filter (where id > 200), sum(salary), sum(salary) filter (where id > 200) from emp group by dept_id --- !query schema -struct 200)):bigint,sum(salary):double,sum(salary) FILTER (WHERE (id > 200)):double> --- !query output -10 2 0 400.0 NULL -100 2 2 800.0 800.0 -20 1 1 300.0 300.0 -30 1 1 400.0 400.0 -70 1 1 150.0 150.0 -NULL 1 1 400.0 400.0 - - --- !query -select dept_id, count(distinct emp_name), count(distinct emp_name) filter (where id + dept_id > 500), sum(salary), sum(salary) filter (where id > 200) from emp group by dept_id --- !query schema -struct 500)):bigint,sum(salary):double,sum(salary) FILTER (WHERE (id > 200)):double> --- !query output -10 2 0 400.0 NULL -100 2 2 800.0 800.0 -20 1 0 300.0 300.0 -30 1 0 400.0 400.0 -70 1 1 150.0 150.0 -NULL 1 0 400.0 400.0 - - --- !query -select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct hiredate), sum(salary) from emp group by dept_id --- !query schema -struct 200)):bigint,count(DISTINCT hiredate):bigint,sum(salary):double> --- !query output -10 0 2 400.0 -100 2 2 800.0 -20 1 1 300.0 -30 1 1 400.0 -70 1 1 150.0 -NULL 1 1 400.0 - - --- !query -select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct hiredate) filter (where hiredate > date "2003-01-01"), sum(salary) from emp group by dept_id --- !query schema -struct 200)):bigint,count(DISTINCT hiredate) FILTER (WHERE (hiredate > DATE '2003-01-01')):bigint,sum(salary):double> --- !query output -10 0 1 400.0 -100 2 1 800.0 -20 1 0 300.0 -30 1 1 400.0 -70 1 1 150.0 -NULL 1 0 400.0 - - --- !query -select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct hiredate) filter (where hiredate > date "2003-01-01"), sum(salary) filter (where salary < 400.00D) from emp group by dept_id --- !query schema -struct 200)):bigint,count(DISTINCT hiredate) FILTER (WHERE (hiredate > DATE '2003-01-01')):bigint,sum(salary) FILTER (WHERE (salary < 400.0)):double> --- !query output -10 0 1 400.0 -100 2 1 NULL -20 1 0 300.0 -30 1 1 NULL -70 1 1 150.0 -NULL 1 0 NULL - - --- !query -select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct hiredate) filter (where hiredate > date "2003-01-01"), sum(salary) filter (where salary < 400.00D), sum(salary) filter (where id > 200) from emp group by dept_id --- !query schema -struct 200)):bigint,count(DISTINCT hiredate) FILTER (WHERE (hiredate > DATE '2003-01-01')):bigint,sum(salary) FILTER (WHERE (salary < 400.0)):double,sum(salary) FILTER (WHERE (id > 200)):double> --- !query output -10 0 1 400.0 NULL -100 2 1 NULL 800.0 -20 1 0 300.0 300.0 -30 1 1 NULL 400.0 -70 1 1 150.0 150.0 -NULL 1 0 NULL 400.0 - - --- !query -select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct emp_name), sum(salary) from emp group by dept_id --- !query schema -struct 200)):bigint,count(DISTINCT emp_name):bigint,sum(salary):double> --- !query output -10 0 2 400.0 -100 2 2 800.0 -20 1 1 300.0 -30 1 1 400.0 -70 1 1 150.0 -NULL 1 1 400.0 - - --- !query -select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct emp_name) filter (where hiredate > date "2003-01-01"), sum(salary) from emp group by dept_id --- !query schema -struct 200)):bigint,count(DISTINCT emp_name) FILTER (WHERE (hiredate > DATE '2003-01-01')):bigint,sum(salary):double> --- !query output -10 0 1 400.0 -100 2 1 800.0 -20 1 0 300.0 -30 1 1 400.0 -70 1 1 150.0 -NULL 1 0 400.0 - - --- !query -select dept_id, sum(distinct (id + dept_id))) filter (where id > 200), count(distinct hiredate), sum(salary) from emp group by dept_id --- !query schema -struct<> --- !query output -org.apache.spark.sql.catalyst.parser.ParseException - -mismatched input ')' expecting {, '(', ',', '.', '[', 'ADD', 'AFTER', 'ALL', 'ALTER', 'ANALYZE', 'AND', 'ANTI', 'ANY', 'ARCHIVE', 'ARRAY', 'AS', 'ASC', 'AT', 'AUTHORIZATION', 'BETWEEN', 'BOTH', 'BUCKET', 'BUCKETS', 'BY', 'CACHE', 'CASCADE', 'CASE', 'CAST', 'CHANGE', 'CHECK', 'CLEAR', 'CLUSTER', 'CLUSTERED', 'CODEGEN', 'COLLATE', 'COLLECTION', 'COLUMN', 'COLUMNS', 'COMMENT', 'COMMIT', 'COMPACT', 'COMPACTIONS', 'COMPUTE', 'CONCATENATE', 'CONSTRAINT', 'COST', 'CREATE', 'CROSS', 'CUBE', 'CURRENT', 'CURRENT_DATE', 'CURRENT_TIME', 'CURRENT_TIMESTAMP', 'CURRENT_USER', 'DATA', 'DATABASE', DATABASES, 'DAY', 'DBPROPERTIES', 'DEFINED', 'DELETE', 'DELIMITED', 'DESC', 'DESCRIBE', 'DFS', 'DIRECTORIES', 'DIRECTORY', 'DISTINCT', 'DISTRIBUTE', 'DROP', 'ELSE', 'END', 'ESCAPE', 'ESCAPED', 'EXCEPT', 'EXCHANGE', 'EXISTS', 'EXPLAIN', 'EXPORT', 'EXTENDED', 'EXTERNAL', 'EXTRACT', 'FALSE', 'FETCH', 'FIELDS', 'FILTER', 'FILEFORMAT', 'FIRST', 'FIRST_VALUE', 'FOLLOWING', 'FOR', 'FOREIGN', 'FORMAT', 'FORMATTED', 'FROM', 'FULL', 'FUNCTION', 'FUNCTIONS', 'GLOBAL', 'GRANT', 'GROUP', 'GROUPING', 'HAVING', 'HOUR', 'IF', 'IGNORE', 'IMPORT', 'IN', 'INDEX', 'INDEXES', 'INNER', 'INPATH', 'INPUTFORMAT', 'INSERT', 'INTERSECT', 'INTERVAL', 'INTO', 'IS', 'ITEMS', 'JOIN', 'KEYS', 'LAST', 'LAST_VALUE', 'LATERAL', 'LAZY', 'LEADING', 'LEFT', 'LIKE', 'LIMIT', 'LINES', 'LIST', 'LOAD', 'LOCAL', 'LOCATION', 'LOCK', 'LOCKS', 'LOGICAL', 'MACRO', 'MAP', 'MATCHED', 'MERGE', 'MINUTE', 'MONTH', 'MSCK', 'NAMESPACE', 'NAMESPACES', 'NATURAL', 'NO', NOT, 'NULL', 'NULLS', 'OF', 'ON', 'ONLY', 'OPTION', 'OPTIONS', 'OR', 'ORDER', 'OUT', 'OUTER', 'OUTPUTFORMAT', 'OVER', 'OVERLAPS', 'OVERLAY', 'OVERWRITE', 'PARTITION', 'PARTITIONED', 'PARTITIONS', 'PERCENT', 'PIVOT', 'PLACING', 'POSITION', 'PRECEDING', 'PRIMARY', 'PRINCIPALS', 'PROPERTIES', 'PURGE', 'QUERY', 'RANGE', 'RECORDREADER', 'RECORDWRITER', 'RECOVER', 'REDUCE', 'REFERENCES', 'REFRESH', 'RENAME', 'REPAIR', 'REPLACE', 'RESET', 'RESPECT', 'RESTRICT', 'REVOKE', 'RIGHT', RLIKE, 'ROLE', 'ROLES', 'ROLLBACK', 'ROLLUP', 'ROW', 'ROWS', 'SCHEMA', 'SECOND', 'SELECT', 'SEMI', 'SEPARATED', 'SERDE', 'SERDEPROPERTIES', 'SESSION_USER', 'SET', 'MINUS', 'SETS', 'SHOW', 'SKEWED', 'SOME', 'SORT', 'SORTED', 'START', 'STATISTICS', 'STORED', 'STRATIFY', 'STRUCT', 'SUBSTR', 'SUBSTRING', 'TABLE', 'TABLES', 'TABLESAMPLE', 'TBLPROPERTIES', TEMPORARY, 'TERMINATED', 'THEN', 'TO', 'TOUCH', 'TRAILING', 'TRANSACTION', 'TRANSACTIONS', 'TRANSFORM', 'TRIM', 'TRUE', 'TRUNCATE', 'TYPE', 'UNARCHIVE', 'UNBOUNDED', 'UNCACHE', 'UNION', 'UNIQUE', 'UNKNOWN', 'UNLOCK', 'UNSET', 'UPDATE', 'USE', 'USER', 'USING', 'VALUES', 'VIEW', 'WHEN', 'WHERE', 'WINDOW', 'WITH', 'YEAR', EQ, '<=>', '<>', '!=', '<', LTE, '>', GTE, '+', '-', '*', '/', '%', 'DIV', '&', '|', '||', '^', IDENTIFIER, BACKQUOTED_IDENTIFIER}(line 1, pos 44) - -== SQL == -select dept_id, sum(distinct (id + dept_id))) filter (where id > 200), count(distinct hiredate), sum(salary) from emp group by dept_id ---------------------------------------------^^^ - - --- !query -select dept_id, sum(distinct (id + dept_id)) filter (where id > 200), count(distinct hiredate) filter (where hiredate > date "2003-01-01"), sum(salary) from emp group by dept_id --- !query schema -struct 200)):bigint,count(DISTINCT hiredate) FILTER (WHERE (hiredate > DATE '2003-01-01')):bigint,sum(salary):double> --- !query output -10 NULL 1 400.0 -100 1500 1 800.0 -20 320 0 300.0 -30 430 1 400.0 -70 870 1 150.0 -NULL NULL 0 400.0 - - --- !query -select dept_id, avg(distinct (id + dept_id)) filter (where id > 200), count(distinct hiredate) filter (where hiredate > date "2003-01-01"), sum(salary) filter (where salary < 400.00D) from emp group by dept_id --- !query schema -struct 200)):double,count(DISTINCT hiredate) FILTER (WHERE (hiredate > DATE '2003-01-01')):bigint,sum(salary) FILTER (WHERE (salary < 400.0)):double> --- !query output -10 NULL 1 400.0 -100 750.0 1 NULL -20 320.0 0 300.0 -30 430.0 1 NULL -70 870.0 1 150.0 -NULL NULL 0 NULL - - -- !query SELECT 'foo', APPROX_COUNT_DISTINCT(a) FILTER (WHERE b >= 0) FROM testData WHERE a = 0 GROUP BY 1 -- !query schema @@ -589,15 +309,6 @@ struct<((a + 1) + 1):int,count(b) FILTER (WHERE (b > 0)):bigint> NULL 1 --- !query -SELECT COUNT(DISTINCT b) FILTER (WHERE b > 0), COUNT(DISTINCT b, c) FILTER (WHERE b > 0 AND c > 2) -FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY a --- !query schema -struct 0)):bigint,count(DISTINCT b, c) FILTER (WHERE ((b > 0) AND (c > 2))):bigint> --- !query output -1 1 - - -- !query SELECT a AS k, COUNT(b) FILTER (WHERE b > 0) FROM testData GROUP BY k -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part3.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part3.sql.out index e1f735e5fe1dc..69f96b02782e3 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part3.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part3.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 5 +-- Number of queries: 4 -- !query @@ -27,20 +27,6 @@ struct 0)):d 2828.9682539682954 --- !query -select ten, sum(distinct four) filter (where four > 10) from onek a -group by ten -having exists (select 1 from onek b where sum(distinct a.four) = b.four) --- !query schema -struct 10)):bigint> --- !query output -0 NULL -2 NULL -4 NULL -6 NULL -8 NULL - - -- !query select (select count(*) from (values (1)) t0(inner_c)) diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/groupingsets.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/groupingsets.sql.out index 1ee653ad67bb7..24fd9dcbfc826 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/groupingsets.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/groupingsets.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 55 +-- Number of queries: 54 -- !query @@ -443,25 +443,6 @@ struct NULL 1 --- !query -select ten, sum(distinct four) filter (where string(four) like '123') from onek a -group by rollup(ten) --- !query schema -struct --- !query output -0 NULL -1 NULL -2 NULL -3 NULL -4 NULL -5 NULL -6 NULL -7 NULL -8 NULL -9 NULL -NULL NULL - - -- !query select count(*) from gstest4 group by rollup(unhashable_col,unsortable_col) -- !query schema