apache · mbutrovich · Aug 1, 2025 · Jul 23, 2025 · Jul 23, 2025 · Jul 23, 2025
diff --git a/native/Cargo.lock b/native/Cargo.lock
diff --git a/native/Cargo.toml b/native/Cargo.toml
@@ -34,12 +34,12 @@ edition = "2021"
 rust-version = "1.85"
 
 [workspace.dependencies]
-arrow = { version = "55.1.0", features = ["prettyprint", "ffi", "chrono-tz"] }
+arrow = { version = "55.2.0", features = ["prettyprint", "ffi", "chrono-tz"] }
 async-trait = { version = "0.1" }
 bytes = { version = "1.10.0" }
-parquet = { version = "55.1.0", default-features = false, features = ["experimental"] }
-datafusion = { version = "48.0.0", default-features = false, features = ["unicode_expressions", "crypto_expressions", "nested_expressions", "parquet"] }
-datafusion-spark = { version = "48.0.0" }
+parquet = { version = "55.2.0", default-features = false, features = ["experimental"] }
+datafusion = { version = "49.0.0", default-features = false, features = ["unicode_expressions", "crypto_expressions", "nested_expressions", "parquet"] }
+datafusion-spark = { version = "49.0.0" }
 datafusion-comet-spark-expr = { path = "spark-expr" }
 datafusion-comet-proto = { path = "proto" }
 chrono = { version = "0.4", default-features = false, features = ["clock"] }
@@ -49,7 +49,7 @@ num = "0.4"
 rand = "0.9"
 regex = "1.9.6"
 thiserror = "2"
-object_store = { version = "0.12.0", features = ["gcp", "azure", "aws", "http"] }
+object_store = { version = "0.12.3", features = ["gcp", "azure", "aws", "http"] }
 url = "2.2"
 aws-config = "1.6.3"
 aws-credential-types = "1.2.3"

diff --git a/native/core/Cargo.toml b/native/core/Cargo.toml
@@ -78,13 +78,13 @@ datafusion-comet-objectstore-hdfs = { path = "../hdfs", optional = true, default
 procfs = "0.17.0"
 
 [dev-dependencies]
-pprof = { version = "0.14.0", features = ["flamegraph"] }
-criterion = { version = "0.5.1", features = ["async_tokio"] }
+pprof = { version = "0.15", features = ["flamegraph"] }
+criterion = { version = "0.7", features = ["async", "async_tokio", "async_std"] }
 jni = { version = "0.21", features = ["invocation"] }
 lazy_static = "1.4"
 assertables = "9"
 hex = "0.4.3"
-datafusion-functions-nested = { version = "48.0.0" }
+datafusion-functions-nested = { version = "49.0.0" }
 
 [features]
 default = []

diff --git a/native/core/benches/bit_util.rs b/native/core/benches/bit_util.rs
@@ -24,7 +24,8 @@ use comet::common::bit::{
     log2, read_num_bytes_u32, read_num_bytes_u64, read_u32, read_u64, set_bits, trailing_bits,
     BitReader, BitWriter,
 };
-use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
+use std::hint::black_box;
 
 /// Benchmark to measure bit_util performance.
 /// To run this benchmark:

diff --git a/native/core/benches/filter.rs b/native/core/benches/filter.rs
@@ -20,7 +20,8 @@ use arrow::array::{ArrayRef, RecordBatch};
 use arrow::compute::filter_record_batch;
 use arrow::datatypes::{DataType, Field, Schema};
 use comet::execution::operators::comet_filter_record_batch;
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{criterion_group, criterion_main, Criterion};
+use std::hint::black_box;
 use std::sync::Arc;
 use std::time::Duration;
 

diff --git a/native/core/benches/shuffle_writer.rs b/native/core/benches/shuffle_writer.rs
@@ -89,7 +89,8 @@ fn criterion_benchmark(c: &mut Criterion) {
         CometPartitioning::RangePartitioning(
             LexOrdering::new(vec![PhysicalSortExpr::new_default(
                 col("c0", batch.schema().as_ref()).unwrap(),
-            )]),
+            )])
+            .unwrap(),
             16,
             100,
         ),

diff --git a/native/core/src/execution/operators/filter.rs b/native/core/src/execution/operators/filter.rs
@@ -211,22 +211,16 @@ impl FilterExec {
             if let Some(binary) = conjunction.as_any().downcast_ref::<BinaryExpr>() {
                 if binary.op() == &Operator::Eq {
                     // Filter evaluates to single value for all partitions
-                    if input_eqs.is_expr_constant(binary.left()) {
-                        let (expr, across_parts) = (
-                            binary.right(),
-                            input_eqs.get_expr_constant_value(binary.right()),
-                        );
-                        res_constants.push(
-                            ConstExpr::new(Arc::clone(expr)).with_across_partitions(across_parts),
-                        );
-                    } else if input_eqs.is_expr_constant(binary.right()) {
-                        let (expr, across_parts) = (
-                            binary.left(),
-                            input_eqs.get_expr_constant_value(binary.left()),
-                        );
-                        res_constants.push(
-                            ConstExpr::new(Arc::clone(expr)).with_across_partitions(across_parts),
-                        );
+                    if input_eqs.is_expr_constant(binary.left()).is_some() {
+                        let across = input_eqs
+                            .is_expr_constant(binary.right())
+                            .unwrap_or_default();
+                        res_constants.push(ConstExpr::new(Arc::clone(binary.right()), across));
+                    } else if input_eqs.is_expr_constant(binary.right()).is_some() {
+                        let across = input_eqs
+                            .is_expr_constant(binary.left())
+                            .unwrap_or_default();
+                        res_constants.push(ConstExpr::new(Arc::clone(binary.left()), across));
                     }
                 }
             }
@@ -246,7 +240,7 @@ impl FilterExec {
         let mut eq_properties = input.equivalence_properties().clone();
         let (equal_pairs, _) = collect_columns_from_predicate(predicate);
         for (lhs, rhs) in equal_pairs {
-            eq_properties.add_equal_conditions(lhs, rhs)?
+            eq_properties.add_equal_conditions(Arc::clone(lhs), Arc::clone(rhs))?
         }
         // Add the columns that have only one viable value (singleton) after
         // filtering to constants.
@@ -258,14 +252,13 @@ impl FilterExec {
                     .min_value
                     .get_value();
                 let expr = Arc::new(column) as _;
-                ConstExpr::new(expr)
-                    .with_across_partitions(AcrossPartitions::Uniform(value.cloned()))
+                ConstExpr::new(expr, AcrossPartitions::Uniform(value.cloned()))
             });
         // This is for statistics
-        eq_properties = eq_properties.with_constants(constants);
+        eq_properties.add_constants(constants)?;
         // This is for logical constant (for example: a = '1', then a could be marked as a constant)
-        // to do: how to deal with multiple situation to represent = (for example c1 between 0 and 0)
-        eq_properties = eq_properties.with_constants(Self::extend_constants(input, predicate));
+        // TODO: handle other ways of expressing equality (for example, c1 between 0 and 0)
+        eq_properties.add_constants(Self::extend_constants(input, predicate))?;
 
         let mut output_partitioning = input.output_partitioning().clone();
         // If contains projection, update the PlanProperties.

diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs
@@ -72,7 +72,7 @@ use crate::parquet::parquet_support::prepare_object_store_with_configs;
 use datafusion::common::scalar::ScalarStructBuilder;
 use datafusion::common::{
     tree_node::{Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeRewriter},
-    JoinType as DFJoinType, ScalarValue,
+    JoinType as DFJoinType, NullEquality, ScalarValue,
 };
 use datafusion::datasource::listing::PartitionedFile;
 use datafusion::logical_expr::type_coercion::other::get_coerce_type_for_case_expression;
@@ -594,6 +594,14 @@ impl PhysicalPlanner {
                         true,
                         false,
                     ))),
+                    // DataFusion 49 hardcodes the return type of the built-in MD5 function to
+                    // Utf8View, which Comet does not support yet, so cast the result to Utf8.
+                    // TODO: remove this cast once Comet supports Utf8View.
+                    "md5" => Ok(Arc::new(Cast::new(
+                        func?,
+                        DataType::Utf8,
+                        SparkCastOptions::new_without_timezone(EvalMode::Try, true),
+                    ))),
                     _ => func,
                 }
             }
@@ -1153,7 +1161,7 @@ impl PhysicalPlanner {
                 let child_copied = Self::wrap_in_copy_exec(Arc::clone(&child.native_plan));
 
                 let sort = Arc::new(
-                    SortExec::new(LexOrdering::new(exprs?), Arc::clone(&child_copied))
+                    SortExec::new(LexOrdering::new(exprs?).unwrap(), Arc::clone(&child_copied))
                         .with_fetch(fetch),
                 );
 
@@ -1429,7 +1437,7 @@ impl PhysicalPlanner {
                     sort_options,
                     // null doesn't equal to null in Spark join key. If the join key is
                     // `EqualNullSafe`, Spark will rewrite it during planning.
-                    false,
+                    NullEquality::NullEqualsNothing,
                 )?);
 
                 if join.filter.is_some() {
@@ -1497,7 +1505,7 @@ impl PhysicalPlanner {
                     PartitionMode::Partitioned,
                     // null doesn't equal to null in Spark join key. If the join key is
                     // `EqualNullSafe`, Spark will rewrite it during planning.
-                    false,
+                    NullEquality::NullEqualsNothing,
                 )?);
 
                 // If the hash join is build right, we need to swap the left and right
@@ -2193,13 +2201,15 @@ impl PhysicalPlanner {
         };
 
         let window_frame = WindowFrame::new_bounds(units, lower_bound, upper_bound);
+        let lex_orderings = LexOrdering::new(sort_exprs.to_vec());
+        let sort_phy_exprs = lex_orderings.as_deref().unwrap_or(&[]);
 
         datafusion::physical_plan::windows::create_window_expr(
             &window_func,
             window_func_name,
             &window_args,
             partition_by,
-            &LexOrdering::new(sort_exprs.to_vec()),
+            sort_phy_exprs,
             window_frame.into(),
             input_schema.as_ref(),
             false, // TODO: Ignore nulls
@@ -2280,7 +2290,7 @@ impl PhysicalPlanner {
                     .iter()
                     .map(|expr| self.create_sort_expr(expr, Arc::clone(&input_schema)))
                     .collect();
-                let lex_ordering = LexOrdering::from(exprs?);
+                let lex_ordering = LexOrdering::new(exprs?).unwrap();
                 Ok(CometPartitioning::RangePartitioning(
                     lex_ordering,
                     range_partition.num_partitions as usize,

diff --git a/native/core/src/execution/shuffle/comet_partitioning.rs b/native/core/src/execution/shuffle/comet_partitioning.rs
@@ -24,7 +24,7 @@ pub enum CometPartitioning {
     /// Allocate rows based on a hash of one of more expressions and the specified number of
     /// partitions
     Hash(Vec<Arc<dyn PhysicalExpr>>, usize),
-    /// Allocate rows based on lexical order of one of more expressions and the specified number of
+    /// Allocate rows based on the lexical order of one or more expressions and the specified number of
     /// partitions
     RangePartitioning(LexOrdering, usize, usize),
 }

diff --git a/native/core/src/execution/shuffle/range_partitioner.rs b/native/core/src/execution/shuffle/range_partitioner.rs
@@ -247,7 +247,7 @@ mod test {
 
         let (rows, row_converter) = RangePartitioner::generate_bounds(
             input_batch.columns().to_vec().as_ref(),
-            &lex_ordering,
+            &lex_ordering.unwrap(),
             10,
             input_batch.num_rows(),
             1000,

diff --git a/native/core/src/execution/shuffle/shuffle_writer.rs b/native/core/src/execution/shuffle/shuffle_writer.rs
@@ -934,7 +934,7 @@ impl SinglePartitionShufflePartitioner {
                     Ok(Some(concatenated))
                 }
                 Err(e) => Err(DataFusionError::ArrowError(
-                    e,
+                    Box::from(e),
                     Some(DataFusionError::get_back_trace()),
                 )),
             }
@@ -1122,7 +1122,7 @@ impl Iterator for PartitionedBatchIterator<'_> {
                 Some(Ok(batch))
             }
             Err(e) => Some(Err(DataFusionError::ArrowError(
-                e,
+                Box::from(e),
                 Some(DataFusionError::get_back_trace()),
             ))),
         }
@@ -1409,7 +1409,8 @@ mod test {
             CometPartitioning::RangePartitioning(
                 LexOrdering::new(vec![PhysicalSortExpr::new_default(
                     col("a", batch.schema().as_ref()).unwrap(),
-                )]),
+                )])
+                .unwrap(),
                 num_partitions,
                 100,
             ),

diff --git a/native/hdfs/src/object_store/hdfs.rs b/native/hdfs/src/object_store/hdfs.rs
@@ -32,7 +32,7 @@ use hdfs::walkdir::HdfsWalkDir;
 use object_store::{
     path::{self, Path},
     Error, GetOptions, GetRange, GetResult, GetResultPayload, ListResult, MultipartUpload,
-    ObjectMeta, ObjectStore, PutMultipartOpts, PutOptions, PutPayload, PutResult, Result,
+    ObjectMeta, ObjectStore, PutMultipartOptions, PutOptions, PutPayload, PutResult, Result,
 };
 
 /// scheme for HDFS File System
@@ -139,7 +139,7 @@ impl ObjectStore for HadoopFileSystem {
     async fn put_multipart_opts(
         &self,
         _location: &Path,
-        _opts: PutMultipartOpts,
+        _opts: PutMultipartOptions,
     ) -> object_store::Result<Box<dyn MultipartUpload>> {
         unimplemented!()
     }

diff --git a/native/spark-expr/Cargo.toml b/native/spark-expr/Cargo.toml
@@ -40,11 +40,10 @@ rand = { workspace = true }
 
 [dev-dependencies]
 arrow = {workspace = true}
-criterion = "0.5.1"
+criterion = { version = "0.7", features = ["async", "async_tokio", "async_std"] }
 rand = { workspace = true}
 tokio = { version = "1", features = ["rt-multi-thread"] }
 
-
 [lib]
 name = "datafusion_comet_spark_expr"
 path = "src/lib.rs"

diff --git a/native/spark-expr/benches/aggregate.rs b/native/spark-expr/benches/aggregate.rs
@@ -19,7 +19,7 @@ use arrow::array::builder::{Decimal128Builder, StringBuilder};
 use arrow::array::{ArrayRef, RecordBatch};
 use arrow::datatypes::SchemaRef;
 use arrow::datatypes::{DataType, Field, Schema};
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{criterion_group, criterion_main, Criterion};
 use datafusion::datasource::memory::MemorySourceConfig;
 use datafusion::datasource::source::DataSourceExec;
 use datafusion::execution::TaskContext;
@@ -34,6 +34,7 @@ use datafusion::physical_plan::ExecutionPlan;
 use datafusion_comet_spark_expr::AvgDecimal;
 use datafusion_comet_spark_expr::SumDecimal;
 use futures::StreamExt;
+use std::hint::black_box;
 use std::sync::Arc;
 use std::time::Duration;
 use tokio::runtime::Runtime;

diff --git a/native/spark-expr/benches/bloom_filter_agg.rs b/native/spark-expr/benches/bloom_filter_agg.rs
@@ -19,7 +19,7 @@ use arrow::array::builder::Int64Builder;
 use arrow::array::{ArrayRef, RecordBatch};
 use arrow::datatypes::SchemaRef;
 use arrow::datatypes::{DataType, Field, Schema};
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{criterion_group, criterion_main, Criterion};
 use datafusion::common::ScalarValue;
 use datafusion::datasource::memory::MemorySourceConfig;
 use datafusion::datasource::source::DataSourceExec;
@@ -30,13 +30,13 @@ use datafusion::physical_expr::expressions::{Column, Literal};
 use datafusion::physical_expr::PhysicalExpr;
 use datafusion::physical_plan::aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy};
 use datafusion::physical_plan::ExecutionPlan;
+use datafusion_comet_spark_expr::BloomFilterAgg;
 use futures::StreamExt;
+use std::hint::black_box;
 use std::sync::Arc;
 use std::time::Duration;
 use tokio::runtime::Runtime;
 
-use datafusion_comet_spark_expr::BloomFilterAgg;
-
 fn criterion_benchmark(c: &mut Criterion) {
     let mut group = c.benchmark_group("bloom_filter_agg");
     let num_rows = 8192;

diff --git a/native/spark-expr/benches/conditional.rs b/native/spark-expr/benches/conditional.rs
@@ -19,14 +19,15 @@ use arrow::array::builder::{Int32Builder, StringBuilder};
 use arrow::datatypes::DataType;
 use arrow::datatypes::{Field, Schema};
 use arrow::record_batch::RecordBatch;
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{criterion_group, criterion_main, Criterion};
 use datafusion::common::ScalarValue;
 use datafusion::logical_expr::Operator;
 use datafusion::physical_expr::expressions::Column;
 use datafusion::physical_expr::expressions::Literal;
 use datafusion::physical_expr::expressions::{BinaryExpr, CaseExpr};
 use datafusion::physical_expr::PhysicalExpr;
 use datafusion_comet_spark_expr::IfExpr;
+use std::hint::black_box;
 use std::sync::Arc;
 
 fn make_col(name: &str, index: usize) -> Arc<dyn PhysicalExpr> {

diff --git a/native/spark-expr/benches/decimal_div.rs b/native/spark-expr/benches/decimal_div.rs
@@ -18,9 +18,10 @@
 use arrow::array::builder::Decimal128Builder;
 use arrow::compute::cast;
 use arrow::datatypes::DataType;
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{criterion_group, criterion_main, Criterion};
 use datafusion::physical_plan::ColumnarValue;
 use datafusion_comet_spark_expr::{spark_decimal_div, spark_decimal_integral_div};
+use std::hint::black_box;
 use std::sync::Arc;
 
 fn criterion_benchmark(c: &mut Criterion) {

diff --git a/native/spark-expr/src/conversion_funcs/cast.rs b/native/spark-expr/src/conversion_funcs/cast.rs
@@ -960,6 +960,7 @@ fn cast_array(
         {
             spark_cast_nonintegral_numeric_to_integral(&array, eval_mode, from_type, to_type)
         }
+        (Utf8View, Utf8) => Ok(cast_with_options(&array, to_type, &CAST_OPTIONS)?),
         (Struct(_), Utf8) => Ok(casts_struct_to_string(array.as_struct(), cast_options)?),
         (Struct(_), Struct(_)) => Ok(cast_struct_to_struct(
             array.as_struct(),