From d30b624c806b2b07eeff2bc34911d6df964ed340 Mon Sep 17 00:00:00 2001 From: David Hewitt Date: Tue, 15 Apr 2025 10:50:47 +0100 Subject: [PATCH 1/2] support OR operator in binary `evaluate_bounds` --- .../expr-common/src/interval_arithmetic.rs | 3 +- .../physical-expr/src/expressions/binary.rs | 94 +++++++++++++++++++ 2 files changed, 96 insertions(+), 1 deletion(-) diff --git a/datafusion/expr-common/src/interval_arithmetic.rs b/datafusion/expr-common/src/interval_arithmetic.rs index 6af4322df29e..602c1bbdf720 100644 --- a/datafusion/expr-common/src/interval_arithmetic.rs +++ b/datafusion/expr-common/src/interval_arithmetic.rs @@ -606,7 +606,7 @@ impl Interval { upper: ScalarValue::Boolean(Some(upper)), }) } - _ => internal_err!("Incompatible data types for logical conjunction"), + _ => internal_err!("Incompatible data types for logical disjunction"), } } @@ -959,6 +959,7 @@ pub fn apply_operator(op: &Operator, lhs: &Interval, rhs: &Interval) -> Result lhs.lt(rhs), Operator::LtEq => lhs.lt_eq(rhs), Operator::And => lhs.and(rhs), + Operator::Or => lhs.or(rhs), Operator::Plus => lhs.add(rhs), Operator::Minus => lhs.sub(rhs), Operator::Multiply => lhs.mul(rhs), diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 84374f4a2970..2ec5a1b54e7a 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -4927,4 +4927,98 @@ mod tests { let left_value = left_expr.evaluate(&batch).unwrap(); assert!(!check_short_circuit(&left_value, &Operator::Or)); } + + #[test] + fn test_evaluate_bounds_int32() { + let schema = Schema::new(vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Int32, false), + ]); + + let a = Arc::new(Column::new("a", 0)) as _; + let b = Arc::new(Column::new("b", 1)) as _; + + // Test addition bounds + let add_expr = + binary_expr(Arc::clone(&a), Operator::Plus, Arc::clone(&b), &schema).unwrap(); + let add_bounds = add_expr + .evaluate_bounds(&[ + &Interval::make(Some(1), Some(10)).unwrap(), + &Interval::make(Some(5), Some(15)).unwrap(), + ]) + .unwrap(); + assert_eq!(add_bounds, Interval::make(Some(6), Some(25)).unwrap()); + + // Test subtraction bounds + let sub_expr = + binary_expr(Arc::clone(&a), Operator::Minus, Arc::clone(&b), &schema) + .unwrap(); + let sub_bounds = sub_expr + .evaluate_bounds(&[ + &Interval::make(Some(1), Some(10)).unwrap(), + &Interval::make(Some(5), Some(15)).unwrap(), + ]) + .unwrap(); + assert_eq!(sub_bounds, Interval::make(Some(-14), Some(5)).unwrap()); + + // Test multiplication bounds + let mul_expr = + binary_expr(Arc::clone(&a), Operator::Multiply, Arc::clone(&b), &schema) + .unwrap(); + let mul_bounds = mul_expr + .evaluate_bounds(&[ + &Interval::make(Some(1), Some(10)).unwrap(), + &Interval::make(Some(5), Some(15)).unwrap(), + ]) + .unwrap(); + assert_eq!(mul_bounds, Interval::make(Some(5), Some(150)).unwrap()); + + // Test division bounds + let div_expr = + binary_expr(Arc::clone(&a), Operator::Divide, Arc::clone(&b), &schema) + .unwrap(); + let div_bounds = div_expr + .evaluate_bounds(&[ + &Interval::make(Some(10), Some(20)).unwrap(), + &Interval::make(Some(2), Some(5)).unwrap(), + ]) + .unwrap(); + assert_eq!(div_bounds, Interval::make(Some(2), Some(10)).unwrap()); + } + + #[test] + fn test_evaluate_bounds_bool() { + let schema = Schema::new(vec![ + Field::new("a", DataType::Boolean, false), + Field::new("b", DataType::Boolean, false), + ]); + + let a = Arc::new(Column::new("a", 0)) as _; + let b = Arc::new(Column::new("b", 1)) as _; + + // Test OR bounds + let or_expr = + binary_expr(Arc::clone(&a), Operator::Or, Arc::clone(&b), &schema).unwrap(); + let or_bounds = or_expr + .evaluate_bounds(&[ + &Interval::make(Some(true), Some(true)).unwrap(), + &Interval::make(Some(false), Some(false)).unwrap(), + ]) + .unwrap(); + assert_eq!(or_bounds, Interval::make(Some(true), Some(true)).unwrap()); + + // Test AND bounds + let and_expr = + binary_expr(Arc::clone(&a), Operator::And, Arc::clone(&b), &schema).unwrap(); + let and_bounds = and_expr + .evaluate_bounds(&[ + &Interval::make(Some(true), Some(true)).unwrap(), + &Interval::make(Some(false), Some(false)).unwrap(), + ]) + .unwrap(); + assert_eq!( + and_bounds, + Interval::make(Some(false), Some(false)).unwrap() + ); + } } From f5092b64c16eae79409adbc1ac90fbdf06a0a6d5 Mon Sep 17 00:00:00 2001 From: David Hewitt Date: Tue, 29 Apr 2025 19:09:28 +0100 Subject: [PATCH 2/2] fixup tests --- datafusion-examples/examples/expr_api.rs | 7 ++++--- datafusion/physical-expr/src/analysis.rs | 4 ++-- .../physical-expr/src/expressions/binary.rs | 2 +- .../physical-expr/src/intervals/cp_solver.rs | 15 +++++++++++++-- 4 files changed, 20 insertions(+), 8 deletions(-) diff --git a/datafusion-examples/examples/expr_api.rs b/datafusion-examples/examples/expr_api.rs index b61a350a5a9c..0dccf2d48d01 100644 --- a/datafusion-examples/examples/expr_api.rs +++ b/datafusion-examples/examples/expr_api.rs @@ -424,7 +424,7 @@ fn boundary_analysis_in_conjuctions_demo() -> Result<()> { // // But `AND` conjunctions are easier to reason with because their interval // arithmetic follows naturally from set intersection operations, let us - // now look at an example that is a tad more complicated `OR` conjunctions. + // now look at an example that is a tad more complicated `OR` disjunctions. // The expression we will look at is `age > 60 OR age <= 18`. let age_greater_than_60_less_than_18 = @@ -435,7 +435,7 @@ fn boundary_analysis_in_conjuctions_demo() -> Result<()> { // // Initial range: [14, 79] as described in our column statistics. // - // From the left-hand side and right-hand side of our `OR` conjunctions + // From the left-hand side and right-hand side of our `OR` disjunctions // we end up with two ranges, instead of just one. // // - age > 60: [61, 79] @@ -446,7 +446,8 @@ fn boundary_analysis_in_conjuctions_demo() -> Result<()> { let physical_expr = SessionContext::new() .create_physical_expr(age_greater_than_60_less_than_18, &df_schema)?; - // Since we don't handle interval arithmetic for `OR` operator this will error out. + // However, analysis only supports a single interval, so we don't yet deal + // with the multiple possibilities of the `OR` disjunctions. let analysis = analyze( &physical_expr, AnalysisContext::new(initial_boundaries), diff --git a/datafusion/physical-expr/src/analysis.rs b/datafusion/physical-expr/src/analysis.rs index 5abd50f6d1b4..cf7db10bb7a2 100644 --- a/datafusion/physical-expr/src/analysis.rs +++ b/datafusion/physical-expr/src/analysis.rs @@ -100,7 +100,7 @@ impl ExprBoundaries { ) -> Result { let field = schema.fields().get(col_index).ok_or_else(|| { internal_datafusion_err!( - "Could not create `ExprBoundaries`: in `try_from_column` `col_index` + "Could not create `ExprBoundaries`: in `try_from_column` `col_index` has gone out of bounds with a value of {col_index}, the schema has {} columns.", schema.fields.len() ) @@ -425,7 +425,7 @@ mod tests { fn test_analyze_invalid_boundary_exprs() { let schema = Arc::new(Schema::new(vec![make_field("a", DataType::Int32)])); let expr = col("a").lt(lit(10)).or(col("a").gt(lit(20))); - let expected_error = "Interval arithmetic does not support the operator OR"; + let expected_error = "OR operator cannot yet propagate true intervals"; let boundaries = ExprBoundaries::try_new_unbounded(&schema).unwrap(); let df_schema = DFSchema::try_from(Arc::clone(&schema)).unwrap(); let physical_expr = diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 1e0a572f6024..798e68a459ce 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -516,7 +516,7 @@ impl PhysicalExpr for BinaryExpr { } } else if self.op.eq(&Operator::Or) { if interval.eq(&Interval::CERTAINLY_FALSE) { - // A certainly false logical conjunction can only derive from certainly + // A certainly false logical disjunction can only derive from certainly // false operands. Otherwise, we prove infeasibility. Ok((!left_interval.eq(&Interval::CERTAINLY_TRUE) && !right_interval.eq(&Interval::CERTAINLY_TRUE)) diff --git a/datafusion/physical-expr/src/intervals/cp_solver.rs b/datafusion/physical-expr/src/intervals/cp_solver.rs index a53814c3ad2b..bcd67bf8691f 100644 --- a/datafusion/physical-expr/src/intervals/cp_solver.rs +++ b/datafusion/physical-expr/src/intervals/cp_solver.rs @@ -148,12 +148,12 @@ use std::sync::Arc; use super::utils::{ convert_duration_type_to_interval, convert_interval_type_to_duration, get_inverse_op, }; -use crate::expressions::Literal; +use crate::expressions::{BinaryExpr, Literal}; use crate::utils::{build_dag, ExprTreeNode}; use crate::PhysicalExpr; use arrow::datatypes::{DataType, Schema}; -use datafusion_common::{internal_err, Result}; +use datafusion_common::{internal_err, not_impl_err, Result}; use datafusion_expr::interval_arithmetic::{apply_operator, satisfy_greater, Interval}; use datafusion_expr::Operator; @@ -645,6 +645,17 @@ impl ExprIntervalGraph { .map(|child| self.graph[*child].interval()) .collect::>(); let node_interval = self.graph[node].interval(); + // Special case: true OR could in principle be propagated by 3 interval sets, + // (i.e. left true, or right true, or both true) however we do not support this yet. + if node_interval == &Interval::CERTAINLY_TRUE + && self.graph[node] + .expr + .as_any() + .downcast_ref::() + .is_some_and(|expr| expr.op() == &Operator::Or) + { + return not_impl_err!("OR operator cannot yet propagate true intervals"); + } let propagated_intervals = self.graph[node] .expr .propagate_constraints(node_interval, &children_intervals)?;