From d0c84cc3f585906b7bdcfe7c074a95f97da20275 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sat, 20 Jan 2024 00:05:31 -0800 Subject: [PATCH] aggregate_statistics should only optimize MIN/MAX when relation is not empty (#8914) * Fix aggregate_statistics * Add more test --- .../aggregate_statistics.rs | 62 +++++++++++++------ .../sqllogictest/test_files/aggregate.slt | 34 ++++++++++ 2 files changed, 76 insertions(+), 20 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/aggregate_statistics.rs b/datafusion/core/src/physical_optimizer/aggregate_statistics.rs index 86a8cdb7b3d4..0a53c775aa89 100644 --- a/datafusion/core/src/physical_optimizer/aggregate_statistics.rs +++ b/datafusion/core/src/physical_optimizer/aggregate_statistics.rs @@ -197,17 +197,28 @@ fn take_optimizable_min( agg_expr: &dyn AggregateExpr, stats: &Statistics, ) -> Option<(ScalarValue, String)> { - let col_stats = &stats.column_statistics; - if let Some(casted_expr) = agg_expr.as_any().downcast_ref::() { - if casted_expr.expressions().len() == 1 { - // TODO optimize with exprs other than Column - if let Some(col_expr) = casted_expr.expressions()[0] - .as_any() - .downcast_ref::() + if let Precision::Exact(num_rows) = &stats.num_rows { + if *num_rows > 0 { + let col_stats = &stats.column_statistics; + if let Some(casted_expr) = + agg_expr.as_any().downcast_ref::() { - if let Precision::Exact(val) = &col_stats[col_expr.index()].min_value { - if !val.is_null() { - return Some((val.clone(), casted_expr.name().to_string())); + if casted_expr.expressions().len() == 1 { + // TODO optimize with exprs other than Column + if let Some(col_expr) = casted_expr.expressions()[0] + .as_any() + .downcast_ref::() + { + if let Precision::Exact(val) = + &col_stats[col_expr.index()].min_value + { + if !val.is_null() { + return Some(( + val.clone(), + casted_expr.name().to_string(), + )); + } + } } } } @@ -221,17 +232,28 @@ fn take_optimizable_max( agg_expr: &dyn AggregateExpr, stats: &Statistics, ) -> Option<(ScalarValue, String)> { - let col_stats = &stats.column_statistics; - if let Some(casted_expr) = agg_expr.as_any().downcast_ref::() { - if casted_expr.expressions().len() == 1 { - // TODO optimize with exprs other than Column - if let Some(col_expr) = casted_expr.expressions()[0] - .as_any() - .downcast_ref::() + if let Precision::Exact(num_rows) = &stats.num_rows { + if *num_rows > 0 { + let col_stats = &stats.column_statistics; + if let Some(casted_expr) = + agg_expr.as_any().downcast_ref::() { - if let Precision::Exact(val) = &col_stats[col_expr.index()].max_value { - if !val.is_null() { - return Some((val.clone(), casted_expr.name().to_string())); + if casted_expr.expressions().len() == 1 { + // TODO optimize with exprs other than Column + if let Some(col_expr) = casted_expr.expressions()[0] + .as_any() + .downcast_ref::() + { + if let Precision::Exact(val) = + &col_stats[col_expr.index()].max_value + { + if !val.is_null() { + return Some(( + val.clone(), + casted_expr.name().to_string(), + )); + } + } } } } diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index 50cdebd054a7..a098c8de0d3c 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -3260,3 +3260,37 @@ query I select count(*) from (select count(*) a, count(*) b from (select 1)); ---- 1 + +# rule `aggregate_statistics` should not optimize MIN/MAX to wrong values on empty relation + +statement ok +CREATE TABLE empty(col0 INTEGER); + +query I +SELECT MIN(col0) FROM empty WHERE col0=1; +---- +NULL + +query I +SELECT MAX(col0) FROM empty WHERE col0=1; +---- +NULL + +statement ok +DROP TABLE empty; + +statement ok +CREATE TABLE t(col0 INTEGER) as VALUES(2); + +query I +SELECT MIN(col0) FROM t WHERE col0=1; +---- +NULL + +query I +SELECT MAX(col0) FROM t WHERE col0=1; +---- +NULL + +statement ok +DROP TABLE t;