apache · alamb · Jul 24, 2025 · Aug 22, 2025 · berkaysynnada · Jul 25, 2025
diff --git a/datafusion/sqllogictest/test_files/partial_sorts.slt b/datafusion/sqllogictest/test_files/partial_sorts.slt
@@ -0,0 +1,132 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Setup: write five rows to a scratch CSV file, then register it as an external table declared WITH ORDER (a)
+query I
+COPY (values
+(1, 'a'),
+(2, 'b'),
+(3, 'd'),
+(4, 'e'),
+(5, 'a')
+) TO 'test_files/scratch/partial_sorts/data.csv' STORED AS CSV;
+----
+5
+
+statement ok
+CREATE EXTERNAL TABLE data (
+    a int,
+    b string
+)
+STORED AS CSV
+LOCATION 'test_files/scratch/partial_sorts/data.csv'
+WITH ORDER (a);
+
+# Expect that no sort is necessary: ORDER BY a matches the ordering the table declares (WITH ORDER (a)), so the physical plan has no SortExec
+query TT
+EXPLAIN SELECT * FROM data ORDER BY a;
+----
+logical_plan
+01)Sort: data.a ASC NULLS LAST
+02)--TableScan: data projection=[a, b]
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/partial_sorts/data.csv]]}, projection=[a, b], output_ordering=[a@0 ASC NULLS LAST], file_type=csv, has_header=true
+
+query IT
+SELECT * FROM data ORDER BY a;
+----
+1 a
+2 b
+3 d
+4 e
+5 a
+
+# Ideally this would only need a PartialSort, since the input is already sorted on 'a'.
+# Note: the plan currently uses a full SortExec due to https://github.com/apache/datafusion/issues/16899
+query TT
+EXPLAIN SELECT * FROM data ORDER BY a, b;
+----
+logical_plan
+01)Sort: data.a ASC NULLS LAST, data.b ASC NULLS LAST
+02)--TableScan: data projection=[a, b]
+physical_plan
+01)SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/partial_sorts/data.csv]]}, projection=[a, b], output_ordering=[a@0 ASC NULLS LAST], file_type=csv, has_header=true
+
+# Grouping on a column that is already sorted should not require a sort.
+# Note: the expected plan below still contains a SortExec (line 04) above the hash RepartitionExec; both AggregateExecs use ordering_mode=Sorted
+query TT
+EXPLAIN SELECT a, count(*) FROM data GROUP BY a ORDER BY a;
+----
+logical_plan
+01)Sort: data.a ASC NULLS LAST
+02)--Projection: data.a, count(Int64(1)) AS count(*)
+03)----Aggregate: groupBy=[[data.a]], aggr=[[count(Int64(1))]]
+04)------TableScan: data projection=[a]
+physical_plan
+01)SortPreservingMergeExec: [a@0 ASC NULLS LAST]
+02)--ProjectionExec: expr=[a@0 as a, count(Int64(1))@1 as count(*)]
+03)----AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[count(Int64(1))], ordering_mode=Sorted
+04)------SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]
+05)--------CoalesceBatchesExec: target_batch_size=8192
+06)----------RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4
+07)------------AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count(Int64(1))], ordering_mode=Sorted
+08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/partial_sorts/data.csv]]}, projection=[a], output_ordering=[a@0 ASC NULLS LAST], file_type=csv, has_header=true
+
+query II
+SELECT a, count(*) FROM data GROUP BY a ORDER BY a;
+----
+1 1
+2 1
+3 1
+4 1
+5 1
+
+# Grouping on both a and b can use an `ordering_mode=PartiallySorted` hash aggregate,
+# as the data is already sorted by 'a'; the final ORDER BY a, b still shows a SortExec (line 02)
+query TT
+EXPLAIN SELECT a, b, count(*) FROM data GROUP BY a, b ORDER BY a, b;
+----
+logical_plan
+01)Sort: data.a ASC NULLS LAST, data.b ASC NULLS LAST
+02)--Projection: data.a, data.b, count(Int64(1)) AS count(*)
+03)----Aggregate: groupBy=[[data.a, data.b]], aggr=[[count(Int64(1))]]
+04)------TableScan: data projection=[a, b]
+physical_plan
+01)SortPreservingMergeExec: [a@0 ASC NULLS LAST, b@1 ASC NULLS LAST]
+02)--SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[a@0 as a, b@1 as b, count(Int64(1))@2 as count(*)]
+04)------AggregateExec: mode=FinalPartitioned, gby=[a@0 as a, b@1 as b], aggr=[count(Int64(1))], ordering_mode=PartiallySorted([0])
+05)--------SortExec: expr=[a@0 ASC NULLS LAST], preserve_partitioning=[true]
+06)----------CoalesceBatchesExec: target_batch_size=8192
+07)------------RepartitionExec: partitioning=Hash([a@0, b@1], 4), input_partitions=4
+08)--------------AggregateExec: mode=Partial, gby=[a@0 as a, b@1 as b], aggr=[count(Int64(1))], ordering_mode=PartiallySorted([0])
+09)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+10)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/partial_sorts/data.csv]]}, projection=[a, b], output_ordering=[a@0 ASC NULLS LAST], file_type=csv, has_header=true
+
+query ITI
+SELECT a, b, count(*) FROM data GROUP BY a, b ORDER BY a, b;
+----
+1 a 1
+2 b 1
+3 d 1
+4 e 1
+5 a 1
+
+# Cleanup: drop the external table `data` created at the top of this file
+statement ok
+drop table data