-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: add guarantees to simplification #7467
Changes from 10 commits
44d1b48
4c1c3a9
2134f2f
caa738f
ff7ed70
a6b57e3
a78f837
011f176
4bd9b60
16d78c6
bffb137
e4427a3
f4e8680
a28d5eb
b50df80
2452957
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -39,10 +39,14 @@ use datafusion_expr::{ | |
and, expr, lit, or, BinaryExpr, BuiltinScalarFunction, Case, ColumnarValue, Expr, | ||
Like, Volatility, | ||
}; | ||
use datafusion_physical_expr::{create_physical_expr, execution_props::ExecutionProps}; | ||
use datafusion_physical_expr::{ | ||
create_physical_expr, execution_props::ExecutionProps, intervals::NullableInterval, | ||
}; | ||
|
||
use crate::simplify_expressions::SimplifyInfo; | ||
|
||
use crate::simplify_expressions::guarantees::GuaranteeRewriter; | ||
|
||
/// This structure handles API for expression simplification | ||
pub struct ExprSimplifier<S> { | ||
info: S, | ||
|
@@ -149,6 +153,76 @@ impl<S: SimplifyInfo> ExprSimplifier<S> { | |
|
||
expr.rewrite(&mut expr_rewrite) | ||
} | ||
|
||
/// Simplify the expression, making use of the provided guarantees. | ||
/// | ||
/// The guarantees can simplify expressions. For example, if a column `x` is | ||
/// guaranteed to be `3`, then the expression `x > 1` can be replaced by the | ||
/// literal `true`. | ||
/// | ||
/// The guarantees are provided as an iterator of `(Expr, NullableInterval)` | ||
/// pairs, where the [Expr] is a column reference and the [NullableInterval] | ||
/// is an interval representing the known possible values of that column. | ||
/// | ||
/// ```rust | ||
/// use arrow::datatypes::{DataType, Field, Schema}; | ||
/// use datafusion_expr::{col, lit, Expr}; | ||
/// use datafusion_common::{Result, ScalarValue, ToDFSchema}; | ||
/// use datafusion_physical_expr::execution_props::ExecutionProps; | ||
/// use datafusion_physical_expr::intervals::{Interval, NullableInterval}; | ||
/// use datafusion_optimizer::simplify_expressions::{ | ||
/// ExprSimplifier, SimplifyContext}; | ||
/// | ||
/// let schema = Schema::new(vec![ | ||
/// Field::new("x", DataType::Int64, false), | ||
/// Field::new("y", DataType::UInt32, false), | ||
/// Field::new("z", DataType::Int64, false), | ||
/// ]) | ||
/// .to_dfschema_ref().unwrap(); | ||
/// | ||
/// // Create the simplifier | ||
/// let props = ExecutionProps::new(); | ||
/// let context = SimplifyContext::new(&props) | ||
/// .with_schema(schema); | ||
/// let simplifier = ExprSimplifier::new(context); | ||
/// | ||
/// // Expression: (x >= 3) AND (y + 2 < 10) AND (z > 5) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is really cool @wjones127 |
||
/// let expr_x = col("x").gt_eq(lit(3_i64)); | ||
/// let expr_y = (col("y") + lit(2_u32)).lt(lit(10_u32)); | ||
/// let expr_z = col("z").gt(lit(5_i64)); | ||
/// let expr = expr_x.and(expr_y).and(expr_z.clone()); | ||
/// | ||
/// let guarantees = vec![ | ||
/// // x ∈ [3, 5] | ||
/// ( | ||
/// col("x"), | ||
/// NullableInterval { | ||
/// values: Interval::make(Some(3_i64), Some(5_i64), (false, false)), | ||
/// is_valid: Interval::CERTAINLY_TRUE, | ||
/// } | ||
/// ), | ||
/// // y = 3 | ||
/// (col("y"), NullableInterval::from(&ScalarValue::UInt32(Some(3)))), | ||
/// ]; | ||
/// let output = simplifier.simplify_with_guarantees(expr, &guarantees).unwrap(); | ||
/// // Expression becomes: true AND true AND (z > 5), which simplifies to | ||
/// // z > 5. | ||
/// assert_eq!(output, expr_z); | ||
/// ``` | ||
pub fn simplify_with_guarantees<'a>( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Another potential API for this might be to store the guarantees on the simplifier -- like let expr = ExprSimplifier::new(context)
.with_guarantees(guarantees)
.simplify()? The downside is that the guarantees would have to be owned (aka a `Vec`). So I think this API is fine, I just wanted to mention the possibility. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
That doesn't seem too bad, I think. My imagined use case is that we re-use the same simplifier with different guarantees but the same predicate. Something like: let mut simplifier = ExprSimplifier::new(context);
for row_group in file {
let guarantees = get_guarantees(row_group.statistics);
simplifier = simplifier.with_guarantees(guarantees);
let group_predicate = simplifier.simplify(predicate);
// Do something with the predicate
} So my main concern is that it's performant if handled in a loop like that. I think it should be. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Switched to this API. |
||
&self, | ||
expr: Expr, | ||
guarantees: impl IntoIterator<Item = &'a (Expr, NullableInterval)>, | ||
) -> Result<Expr> { | ||
// Do a simplification pass in case it reveals places where a guarantee | ||
// could be applied. | ||
let expr = self.simplify(expr)?; | ||
let mut rewriter = GuaranteeRewriter::new(guarantees); | ||
let expr = expr.rewrite(&mut rewriter)?; | ||
// Simplify after guarantees are applied, since constant folding should | ||
// now be able to fold more expressions. | ||
self.simplify(expr) | ||
} | ||
} | ||
|
||
#[allow(rustdoc::private_intra_doc_links)] | ||
|
@@ -1211,7 +1285,9 @@ mod tests { | |
use datafusion_common::{assert_contains, cast::as_int32_array, DFField, ToDFSchema}; | ||
use datafusion_expr::*; | ||
use datafusion_physical_expr::{ | ||
execution_props::ExecutionProps, functions::make_scalar_function, | ||
execution_props::ExecutionProps, | ||
functions::make_scalar_function, | ||
intervals::{Interval, NullableInterval}, | ||
}; | ||
|
||
// ------------------------------ | ||
|
@@ -2675,6 +2751,20 @@ mod tests { | |
try_simplify(expr).unwrap() | ||
} | ||
|
||
/// Test helper: simplifies `expr` using the supplied `guarantees`. | ||
/// | ||
/// Builds an `ExprSimplifier` over the schema returned by `expr_test_schema()` | ||
/// with freshly created `ExecutionProps`, then calls `simplify_with_guarantees`. | ||
/// Panics (via `unwrap`) if simplification returns an error. | ||
fn simplify_with_guarantee( | ||
expr: Expr, | ||
guarantees: &[(Expr, NullableInterval)], | ||
) -> Expr { | ||
let schema = expr_test_schema(); | ||
let execution_props = ExecutionProps::new(); | ||
let simplifier = ExprSimplifier::new( | ||
SimplifyContext::new(&execution_props).with_schema(schema), | ||
); | ||
simplifier | ||
.simplify_with_guarantees(expr, guarantees) | ||
.unwrap() | ||
} | ||
|
||
fn expr_test_schema() -> DFSchemaRef { | ||
Arc::new( | ||
DFSchema::new_with_metadata( | ||
|
@@ -3138,4 +3228,93 @@ mod tests { | |
let expr = not_ilike(null, "%"); | ||
assert_eq!(simplify(expr), lit_bool_null()); | ||
} | ||
|
||
// Exercises `simplify_with_guarantee` on one compound predicate under | ||
// several different sets of column guarantees. | ||
#[test] | ||
fn test_simplify_with_guarantee() { | ||
// (c3 > 3) AND (c4 + 2 < 10 OR (c1 NOT IN ("a", "b"))) | ||
let expr_x = col("c3").gt(lit(3_i64)); | ||
let expr_y = (col("c4") + lit(2_u32)).lt(lit(10_u32)); | ||
let expr_z = col("c1").in_list(vec![lit("a"), lit("b")], true); | ||
let expr = expr_x.clone().and(expr_y.clone().or(expr_z)); | ||
|
||
// All guaranteed null | ||
let guarantees = vec![ | ||
(col("c3"), NullableInterval::from(&ScalarValue::Int64(None))), | ||
( | ||
col("c4"), | ||
NullableInterval::from(&ScalarValue::UInt32(None)), | ||
), | ||
(col("c1"), NullableInterval::from(&ScalarValue::Utf8(None))), | ||
]; | ||
|
||
let output = simplify_with_guarantee(expr.clone(), &guarantees); | ||
assert_eq!(output, lit_bool_null()); | ||
|
||
// All guaranteed false | ||
// (c3 is non-null within [0, 2]; c4 is exactly 9; c1 is exactly "a") | ||
let guarantees = vec![ | ||
( | ||
col("c3"), | ||
NullableInterval { | ||
values: Interval::make(Some(0_i64), Some(2_i64), (false, false)), | ||
is_valid: Interval::CERTAINLY_TRUE, | ||
}, | ||
), | ||
( | ||
col("c4"), | ||
NullableInterval::from(&ScalarValue::UInt32(Some(9))), | ||
), | ||
( | ||
col("c1"), | ||
NullableInterval::from(&ScalarValue::Utf8(Some("a".to_string()))), | ||
), | ||
]; | ||
let output = simplify_with_guarantee(expr.clone(), &guarantees); | ||
assert_eq!(output, lit(false)); | ||
|
||
// c3 and c4 guaranteed false or null -> their predicates are left as-is. | ||
// Only the c1 term (guaranteed to be "a") is removed from the expression. | ||
let guarantees = vec![ | ||
( | ||
col("c3"), | ||
NullableInterval { | ||
values: Interval::make(Some(0_i64), Some(2_i64), (false, false)), | ||
is_valid: Interval::UNCERTAIN, | ||
}, | ||
), | ||
( | ||
col("c4"), | ||
NullableInterval { | ||
values: Interval::make(Some(9_u32), Some(9_u32), (false, false)), | ||
is_valid: Interval::UNCERTAIN, | ||
}, | ||
), | ||
( | ||
col("c1"), | ||
NullableInterval::from(&ScalarValue::Utf8(Some("a".to_string()))), | ||
), | ||
]; | ||
let output = simplify_with_guarantee(expr.clone(), &guarantees); | ||
assert_eq!(output, expr_x.clone().and(expr_y.clone())); | ||
|
||
// Sufficient true guarantees | ||
// (no guarantee is given for c1; c3 and c4 alone decide the result) | ||
let guarantees = vec![ | ||
( | ||
col("c3"), | ||
NullableInterval::from(&ScalarValue::Int64(Some(9))), | ||
), | ||
( | ||
col("c4"), | ||
NullableInterval::from(&ScalarValue::UInt32(Some(3))), | ||
), | ||
]; | ||
let output = simplify_with_guarantee(expr.clone(), &guarantees); | ||
assert_eq!(output, lit(true)); | ||
|
||
// Only partially simplify | ||
// (only c4 is guaranteed; the c3 predicate is all that remains) | ||
let guarantees = vec![( | ||
col("c4"), | ||
NullableInterval::from(&ScalarValue::UInt32(Some(3))), | ||
)]; | ||
let output = simplify_with_guarantee(expr.clone(), &guarantees); | ||
assert_eq!(&output, &expr_x); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
❤️