-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: add guarantees to simplification #7467
Merged
Merged
Changes from all commits
Commits
Show all changes
16 commits
Select commit
Hold shift + click to select a range
44d1b48
feat: add guarantees to simplifcation
wjones127 4c1c3a9
null and comparison support
wjones127 2134f2f
add support for literal expressions
wjones127 caa738f
implement inlist guarantee use
wjones127 ff7ed70
test the outer function
wjones127 a6b57e3
docs
wjones127 a78f837
refactor to use intervals
wjones127 011f176
add high-level test
wjones127 4bd9b60
cleanup
wjones127 16d78c6
fix test to be false or null, not true
wjones127 bffb137
refactor: change NullableInterval to an enum
wjones127 e4427a3
refactor: use a builder-like API
wjones127 f4e8680
pr feedback
wjones127 a28d5eb
Merge remote-tracking branch 'apache/main' into 6171-simplify-with-gu…
alamb b50df80
Fix clippy
alamb 2452957
fix doc links
alamb File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -39,13 +39,20 @@ use datafusion_expr::{ | |
and, expr, lit, or, BinaryExpr, BuiltinScalarFunction, Case, ColumnarValue, Expr, | ||
Like, Volatility, | ||
}; | ||
use datafusion_physical_expr::{create_physical_expr, execution_props::ExecutionProps}; | ||
use datafusion_physical_expr::{ | ||
create_physical_expr, execution_props::ExecutionProps, intervals::NullableInterval, | ||
}; | ||
|
||
use crate::simplify_expressions::SimplifyInfo; | ||
|
||
use crate::simplify_expressions::guarantees::GuaranteeRewriter; | ||
|
||
/// This structure handles API for expression simplification | ||
pub struct ExprSimplifier<S> { | ||
info: S, | ||
/// Guarantees about the values of columns. This is provided by the user | ||
/// in [ExprSimplifier::with_guarantees()]. | ||
guarantees: Vec<(Expr, NullableInterval)>, | ||
} | ||
|
||
pub const THRESHOLD_INLINE_INLIST: usize = 3; | ||
|
@@ -57,7 +64,10 @@ impl<S: SimplifyInfo> ExprSimplifier<S> { | |
/// | ||
/// [`SimplifyContext`]: crate::simplify_expressions::context::SimplifyContext | ||
pub fn new(info: S) -> Self { | ||
Self { info } | ||
Self { | ||
info, | ||
guarantees: vec![], | ||
} | ||
} | ||
|
||
/// Simplifies this [`Expr`]`s as much as possible, evaluating | ||
|
@@ -121,6 +131,7 @@ impl<S: SimplifyInfo> ExprSimplifier<S> { | |
let mut simplifier = Simplifier::new(&self.info); | ||
let mut const_evaluator = ConstEvaluator::try_new(self.info.execution_props())?; | ||
let mut or_in_list_simplifier = OrInListSimplifier::new(); | ||
let mut guarantee_rewriter = GuaranteeRewriter::new(&self.guarantees); | ||
|
||
// TODO iterate until no changes are made during rewrite | ||
// (evaluating constants can enable new simplifications and | ||
|
@@ -129,6 +140,7 @@ impl<S: SimplifyInfo> ExprSimplifier<S> { | |
expr.rewrite(&mut const_evaluator)? | ||
.rewrite(&mut simplifier)? | ||
.rewrite(&mut or_in_list_simplifier)? | ||
.rewrite(&mut guarantee_rewriter)? | ||
// run both passes twice to try an minimize simplifications that we missed | ||
.rewrite(&mut const_evaluator)? | ||
.rewrite(&mut simplifier) | ||
|
@@ -149,6 +161,65 @@ impl<S: SimplifyInfo> ExprSimplifier<S> { | |
|
||
expr.rewrite(&mut expr_rewrite) | ||
} | ||
|
||
/// Input guarantees about the values of columns. | ||
/// | ||
/// The guarantees can simplify expressions. For example, if a column `x` is | ||
/// guaranteed to be `3`, then the expression `x > 1` can be replaced by the | ||
/// literal `true`. | ||
/// | ||
/// The guarantees are provided as a `Vec<(Expr, NullableInterval)>`, | ||
/// where the [Expr] is a column reference and the [NullableInterval] | ||
/// is an interval representing the known possible values of that column. | ||
/// | ||
/// ```rust | ||
/// use arrow::datatypes::{DataType, Field, Schema}; | ||
/// use datafusion_expr::{col, lit, Expr}; | ||
/// use datafusion_common::{Result, ScalarValue, ToDFSchema}; | ||
/// use datafusion_physical_expr::execution_props::ExecutionProps; | ||
/// use datafusion_physical_expr::intervals::{Interval, NullableInterval}; | ||
/// use datafusion_optimizer::simplify_expressions::{ | ||
/// ExprSimplifier, SimplifyContext}; | ||
/// | ||
/// let schema = Schema::new(vec![ | ||
/// Field::new("x", DataType::Int64, false), | ||
/// Field::new("y", DataType::UInt32, false), | ||
/// Field::new("z", DataType::Int64, false), | ||
/// ]) | ||
/// .to_dfschema_ref().unwrap(); | ||
/// | ||
/// // Create the simplifier | ||
/// let props = ExecutionProps::new(); | ||
/// let context = SimplifyContext::new(&props) | ||
/// .with_schema(schema); | ||
/// | ||
/// // Expression: (x >= 3) AND (y + 2 < 10) AND (z > 5) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is really cool @wjones127 |
||
/// let expr_x = col("x").gt_eq(lit(3_i64)); | ||
/// let expr_y = (col("y") + lit(2_u32)).lt(lit(10_u32)); | ||
/// let expr_z = col("z").gt(lit(5_i64)); | ||
/// let expr = expr_x.and(expr_y).and(expr_z.clone()); | ||
/// | ||
/// let guarantees = vec![ | ||
/// // x ∈ [3, 5] | ||
/// ( | ||
/// col("x"), | ||
/// NullableInterval::NotNull { | ||
/// values: Interval::make(Some(3_i64), Some(5_i64), (false, false)), | ||
/// } | ||
/// ), | ||
/// // y = 3 | ||
/// (col("y"), NullableInterval::from(ScalarValue::UInt32(Some(3)))), | ||
/// ]; | ||
/// let simplifier = ExprSimplifier::new(context).with_guarantees(guarantees); | ||
/// let output = simplifier.simplify(expr).unwrap(); | ||
/// // Expression becomes: true AND true AND (z > 5), which simplifies to | ||
/// // z > 5. | ||
/// assert_eq!(output, expr_z); | ||
/// ``` | ||
pub fn with_guarantees(mut self, guarantees: Vec<(Expr, NullableInterval)>) -> Self { | ||
self.guarantees = guarantees; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 |
||
self | ||
} | ||
} | ||
|
||
#[allow(rustdoc::private_intra_doc_links)] | ||
|
@@ -1239,7 +1310,9 @@ mod tests { | |
use datafusion_common::{assert_contains, cast::as_int32_array, DFField, ToDFSchema}; | ||
use datafusion_expr::*; | ||
use datafusion_physical_expr::{ | ||
execution_props::ExecutionProps, functions::make_scalar_function, | ||
execution_props::ExecutionProps, | ||
functions::make_scalar_function, | ||
intervals::{Interval, NullableInterval}, | ||
}; | ||
|
||
// ------------------------------ | ||
|
@@ -2703,6 +2776,19 @@ mod tests { | |
try_simplify(expr).unwrap() | ||
} | ||
|
||
fn simplify_with_guarantee( | ||
expr: Expr, | ||
guarantees: Vec<(Expr, NullableInterval)>, | ||
) -> Expr { | ||
let schema = expr_test_schema(); | ||
let execution_props = ExecutionProps::new(); | ||
let simplifier = ExprSimplifier::new( | ||
SimplifyContext::new(&execution_props).with_schema(schema), | ||
) | ||
.with_guarantees(guarantees); | ||
simplifier.simplify(expr).unwrap() | ||
} | ||
|
||
fn expr_test_schema() -> DFSchemaRef { | ||
Arc::new( | ||
DFSchema::new_with_metadata( | ||
|
@@ -3166,4 +3252,89 @@ mod tests { | |
let expr = not_ilike(null, "%"); | ||
assert_eq!(simplify(expr), lit_bool_null()); | ||
} | ||
|
||
#[test] | ||
fn test_simplify_with_guarantee() { | ||
// (c3 >= 3) AND (c4 + 2 < 10 OR (c1 NOT IN ("a", "b"))) | ||
let expr_x = col("c3").gt(lit(3_i64)); | ||
let expr_y = (col("c4") + lit(2_u32)).lt(lit(10_u32)); | ||
let expr_z = col("c1").in_list(vec![lit("a"), lit("b")], true); | ||
let expr = expr_x.clone().and(expr_y.clone().or(expr_z)); | ||
|
||
// All guaranteed null | ||
let guarantees = vec![ | ||
(col("c3"), NullableInterval::from(ScalarValue::Int64(None))), | ||
(col("c4"), NullableInterval::from(ScalarValue::UInt32(None))), | ||
(col("c1"), NullableInterval::from(ScalarValue::Utf8(None))), | ||
]; | ||
|
||
let output = simplify_with_guarantee(expr.clone(), guarantees); | ||
assert_eq!(output, lit_bool_null()); | ||
|
||
// All guaranteed false | ||
let guarantees = vec![ | ||
( | ||
col("c3"), | ||
NullableInterval::NotNull { | ||
values: Interval::make(Some(0_i64), Some(2_i64), (false, false)), | ||
}, | ||
), | ||
( | ||
col("c4"), | ||
NullableInterval::from(ScalarValue::UInt32(Some(9))), | ||
), | ||
( | ||
col("c1"), | ||
NullableInterval::from(ScalarValue::Utf8(Some("a".to_string()))), | ||
), | ||
]; | ||
let output = simplify_with_guarantee(expr.clone(), guarantees); | ||
assert_eq!(output, lit(false)); | ||
|
||
// Guaranteed false or null -> no change. | ||
let guarantees = vec![ | ||
( | ||
col("c3"), | ||
NullableInterval::MaybeNull { | ||
values: Interval::make(Some(0_i64), Some(2_i64), (false, false)), | ||
}, | ||
), | ||
( | ||
col("c4"), | ||
NullableInterval::MaybeNull { | ||
values: Interval::make(Some(9_u32), Some(9_u32), (false, false)), | ||
}, | ||
), | ||
( | ||
col("c1"), | ||
NullableInterval::NotNull { | ||
values: Interval::make(Some("d"), Some("f"), (false, false)), | ||
}, | ||
), | ||
]; | ||
let output = simplify_with_guarantee(expr.clone(), guarantees); | ||
assert_eq!(&output, &expr_x); | ||
|
||
// Sufficient true guarantees | ||
let guarantees = vec![ | ||
( | ||
col("c3"), | ||
NullableInterval::from(ScalarValue::Int64(Some(9))), | ||
), | ||
( | ||
col("c4"), | ||
NullableInterval::from(ScalarValue::UInt32(Some(3))), | ||
), | ||
]; | ||
let output = simplify_with_guarantee(expr.clone(), guarantees); | ||
assert_eq!(output, lit(true)); | ||
|
||
// Only partially simplify | ||
let guarantees = vec![( | ||
col("c4"), | ||
NullableInterval::from(ScalarValue::UInt32(Some(3))), | ||
)]; | ||
let output = simplify_with_guarantee(expr.clone(), guarantees); | ||
assert_eq!(&output, &expr_x); | ||
} | ||
} |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What do you think of such a loop to cover every simplification case and make it easier to accommodate future simplifications, or would it be unnecessary?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is a neat idea. I think we should try it in a follow on PR.
ALso, If we did this I would also suggest adding a limit on the number of loops (to avoid a "ping-poing" infinite loop where passes rewrite an expression back and forth)