Commit aa333d1

Merge remote-tracking branch 'origin/main' into refactor-treenode-apply

# Conflicts:
#	datafusion/expr/src/expr.rs
#	datafusion/expr/src/utils.rs

2 parents: 8882285 + 8d72196

13 files changed (+1172, -494 lines)
datafusion/common/src/dfschema.rs (16 additions, 0 deletions)

@@ -347,6 +347,22 @@ impl DFSchema {
             .collect()
     }
 
+    /// Find all fields indices having the given qualifier
+    pub fn fields_indices_with_qualified(
+        &self,
+        qualifier: &TableReference,
+    ) -> Vec<usize> {
+        self.fields
+            .iter()
+            .enumerate()
+            .filter_map(|(idx, field)| {
+                field
+                    .qualifier()
+                    .and_then(|q| q.eq(qualifier).then_some(idx))
+            })
+            .collect()
+    }
+
     /// Find all fields match the given name
     pub fn fields_with_unqualified_name(&self, name: &str) -> Vec<&DFField> {
         self.fields

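For context on the new API: fields_indices_with_qualified returns the positions of every field bound to the given table qualifier, in schema order. A minimal usage sketch follows; it assumes the DFSchema::new_with_metadata, DFField::new, and TableReference::bare constructors from datafusion-common of roughly this vintage, and the function name qualifier_indices_example is purely illustrative, not part of the commit.

use std::collections::HashMap;

use arrow_schema::DataType;
use datafusion_common::{DFField, DFSchema, Result, TableReference};

// Hypothetical example: collect the indices of all fields that belong to
// table `t1` in a schema that mixes fields from two qualifiers.
fn qualifier_indices_example() -> Result<()> {
    let schema = DFSchema::new_with_metadata(
        vec![
            DFField::new(Some("t1"), "a", DataType::Int32, true),
            DFField::new(Some("t2"), "c", DataType::Int32, true),
            DFField::new(Some("t1"), "b", DataType::Utf8, true),
        ],
        HashMap::new(),
    )?;

    // Fields 0 (`t1.a`) and 2 (`t1.b`) carry the `t1` qualifier.
    let indices = schema.fields_indices_with_qualified(&TableReference::bare("t1"));
    assert_eq!(indices, vec![0, 2]);
    Ok(())
}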
datafusion/common/src/tree_node.rs (5 additions, 2 deletions)

@@ -167,8 +167,11 @@ pub trait TreeNode: Sized {
         // Run the recursive `transform` on each children.
         let mut payload_up = vec![];
         let tnr = self.transform_children(&mut |c| {
-            let (tnr, p) =
-                c.transform_with_payload(f_down, new_payload_down_iter.next().unwrap(), f_up)?;
+            let (tnr, p) = c.transform_with_payload(
+                f_down,
+                new_payload_down_iter.next().unwrap(),
+                f_up,
+            )?;
             p.into_iter().for_each(|p| payload_up.push(p));
             Ok(tnr)
         })?;

datafusion/core/src/datasource/stream.rs (2 additions, 1 deletion)

@@ -64,7 +64,8 @@ impl TableProviderFactory for StreamTableFactory {
             .with_encoding(encoding)
             .with_order(cmd.order_exprs.clone())
            .with_header(cmd.has_header)
-            .with_batch_size(state.config().batch_size());
+            .with_batch_size(state.config().batch_size())
+            .with_constraints(cmd.constraints.clone());
 
         Ok(Arc::new(StreamTable(Arc::new(config))))
     }

datafusion/core/src/physical_optimizer/enforce_sorting.rs (61 additions, 31 deletions)

@@ -62,8 +62,8 @@ use datafusion_common::tree_node::{
 };
 use datafusion_common::{plan_err, DataFusionError};
 use datafusion_physical_expr::{PhysicalSortExpr, PhysicalSortRequirement};
-
 use datafusion_physical_plan::repartition::RepartitionExec;
+
 use itertools::izip;
 
 /// This rule inspects [`SortExec`]'s in the given physical plan and removes the

@@ -842,14 +842,16 @@ mod tests {
     use crate::physical_plan::repartition::RepartitionExec;
     use crate::physical_plan::{displayable, get_plan_string, Partitioning};
     use crate::prelude::{SessionConfig, SessionContext};
-    use crate::test::{csv_exec_sorted, stream_exec_ordered};
+    use crate::test::{csv_exec_ordered, csv_exec_sorted, stream_exec_ordered};
 
     use arrow::compute::SortOptions;
     use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
     use datafusion_common::Result;
     use datafusion_expr::JoinType;
     use datafusion_physical_expr::expressions::{col, Column, NotExpr};
 
+    use rstest::rstest;
+
     fn create_test_schema() -> Result<SchemaRef> {
         let nullable_column = Field::new("nullable_col", DataType::Int32, true);
         let non_nullable_column = Field::new("non_nullable_col", DataType::Int32, false);

@@ -2213,12 +2215,19 @@ mod tests {
         Ok(())
     }
 
+    #[rstest]
     #[tokio::test]
-    async fn test_with_lost_ordering_unbounded() -> Result<()> {
+    async fn test_with_lost_ordering_unbounded_bounded(
+        #[values(false, true)] source_unbounded: bool,
+    ) -> Result<()> {
         let schema = create_test_schema3()?;
         let sort_exprs = vec![sort_expr("a", &schema)];
-        // create an unbounded source
-        let source = stream_exec_ordered(&schema, sort_exprs);
+        // create either bounded or unbounded source
+        let source = if source_unbounded {
+            stream_exec_ordered(&schema, sort_exprs)
+        } else {
+            csv_exec_ordered(&schema, sort_exprs)
+        };
         let repartition_rr = repartition_exec(source);
         let repartition_hash = Arc::new(RepartitionExec::try_new(
             repartition_rr,

@@ -2227,50 +2236,71 @@
         let coalesce_partitions = coalesce_partitions_exec(repartition_hash);
         let physical_plan = sort_exec(vec![sort_expr("a", &schema)], coalesce_partitions);
 
-        let expected_input = [
+        // Expected inputs unbounded and bounded
+        let expected_input_unbounded = vec![
             "SortExec: expr=[a@0 ASC]",
             "  CoalescePartitionsExec",
             "    RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10",
             "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
             "        StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[a@0 ASC]",
         ];
-        let expected_optimized = [
+        let expected_input_bounded = vec![
+            "SortExec: expr=[a@0 ASC]",
+            "  CoalescePartitionsExec",
+            "    RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10",
+            "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
+            "        CsvExec: file_groups={1 group: [[file_path]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], has_header=true",
+        ];
+
+        // Expected unbounded result (same for with and without flag)
+        let expected_optimized_unbounded = vec![
             "SortPreservingMergeExec: [a@0 ASC]",
             "  RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC",
             "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
             "      StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[a@0 ASC]",
         ];
-        assert_optimized!(expected_input, expected_optimized, physical_plan, true);
-        Ok(())
-    }
 
-    #[tokio::test]
-    async fn test_with_lost_ordering_unbounded_parallelize_off() -> Result<()> {
-        let schema = create_test_schema3()?;
-        let sort_exprs = vec![sort_expr("a", &schema)];
-        // create an unbounded source
-        let source = stream_exec_ordered(&schema, sort_exprs);
-        let repartition_rr = repartition_exec(source);
-        let repartition_hash = Arc::new(RepartitionExec::try_new(
-            repartition_rr,
-            Partitioning::Hash(vec![col("c", &schema).unwrap()], 10),
-        )?) as _;
-        let coalesce_partitions = coalesce_partitions_exec(repartition_hash);
-        let physical_plan = sort_exec(vec![sort_expr("a", &schema)], coalesce_partitions);
-
-        let expected_input = ["SortExec: expr=[a@0 ASC]",
+        // Expected bounded results with and without flag
+        let expected_optimized_bounded = vec![
+            "SortExec: expr=[a@0 ASC]",
             "  CoalescePartitionsExec",
             "    RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10",
             "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-            "        StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[a@0 ASC]",
+            "        CsvExec: file_groups={1 group: [[file_path]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], has_header=true",
         ];
-        let expected_optimized = [
+        let expected_optimized_bounded_parallelize_sort = vec![
             "SortPreservingMergeExec: [a@0 ASC]",
-            "  RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10, preserve_order=true, sort_exprs=a@0 ASC",
-            "    RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
-            "      StreamingTableExec: partition_sizes=1, projection=[a, b, c, d, e], infinite_source=true, output_ordering=[a@0 ASC]",
+            "  SortExec: expr=[a@0 ASC]",
+            "    RepartitionExec: partitioning=Hash([c@2], 10), input_partitions=10",
+            "      RepartitionExec: partitioning=RoundRobinBatch(10), input_partitions=1",
+            "        CsvExec: file_groups={1 group: [[file_path]]}, projection=[a, b, c, d, e], output_ordering=[a@0 ASC], has_header=true",
         ];
-        assert_optimized!(expected_input, expected_optimized, physical_plan, false);
+        let (expected_input, expected_optimized, expected_optimized_sort_parallelize) =
+            if source_unbounded {
+                (
+                    expected_input_unbounded,
+                    expected_optimized_unbounded.clone(),
+                    expected_optimized_unbounded,
+                )
+            } else {
+                (
+                    expected_input_bounded,
+                    expected_optimized_bounded,
+                    expected_optimized_bounded_parallelize_sort,
+                )
+            };
+        assert_optimized!(
+            expected_input,
+            expected_optimized,
+            physical_plan.clone(),
+            false
+        );
+        assert_optimized!(
+            expected_input,
+            expected_optimized_sort_parallelize,
+            physical_plan,
+            true
+        );
         Ok(())
     }

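The last hunk folds two near-duplicate tests (unbounded vs. bounded source) into a single rstest-parametrized test. For readers unfamiliar with the pattern, a minimal standalone sketch is below; the test name and assertion are placeholders, and only the #[rstest]/#[values]/#[tokio::test] combination mirrors the diff.

use rstest::rstest;

// `#[values(false, true)]` expands this one definition into two test cases,
// one per value of `source_unbounded`. For async tests, `#[rstest]` has to
// appear above `#[tokio::test]`, as in the diff.
#[rstest]
#[tokio::test]
async fn runs_once_per_source_kind(#[values(false, true)] source_unbounded: bool) {
    let label = if source_unbounded { "unbounded" } else { "bounded" };
    assert!(["unbounded", "bounded"].contains(&label));
}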