Skip to content

Commit af8e65f

Browse files
aokolnychyi authored and dongjoon-hyun committed
[SPARK-32276][SQL] Remove redundant sorts before repartition nodes
### What changes were proposed in this pull request?

This PR removes redundant sorts before repartition nodes with shuffles and before repartitionByExpression nodes with deterministic expressions.

### Why are the changes needed?

It looks like our `EliminateSorts` rule can be extended further to remove sorts before repartition nodes that shuffle data, as such repartition operations change the ordering and distribution of data. That's why it seems safe to perform the following rewrites:

- `Repartition -> Sort -> Scan` as `Repartition -> Scan`
- `Repartition -> Project -> Sort -> Scan` as `Repartition -> Project -> Scan`

We don't apply this optimization to coalesce as it uses `DefaultPartitionCoalescer`, which may preserve the ordering of data if there is no locality info in the parent RDD. At the same time, there is no guarantee that this will happen.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

More test cases.

Closes #29089 from aokolnychyi/spark-32276.

Authored-by: Anton Okolnychyi <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
1 parent 6bdd710 commit af8e65f

File tree

2 files changed

+187
-0
lines changed

2 files changed

+187
-0
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -974,6 +974,10 @@ object CombineFilters extends Rule[LogicalPlan] with PredicateHelper {
974974
* and the Join conditions is deterministic
975975
* 5) if the Sort operator is within GroupBy separated by 0...n Project/Filter operators only,
976976
* and the aggregate function is order irrelevant
977+
* 6) if the Sort operator is within RepartitionByExpression separated by 0...n Project/Filter
978+
* operators and the repartition expressions are deterministic
979+
* 7) if the Sort operator is within Repartition separated by 0...n Project/Filter operators
980+
* and the repartition requires a shuffle
977981
*/
978982
object EliminateSorts extends Rule[LogicalPlan] {
979983
def apply(plan: LogicalPlan): LogicalPlan = plan transform {
@@ -987,6 +991,10 @@ object EliminateSorts extends Rule[LogicalPlan] {
987991
j.copy(left = recursiveRemoveSort(originLeft), right = recursiveRemoveSort(originRight))
988992
case g @ Aggregate(_, aggs, originChild) if isOrderIrrelevantAggs(aggs) =>
989993
g.copy(child = recursiveRemoveSort(originChild))
994+
case r: RepartitionByExpression if r.partitionExpressions.forall(_.deterministic) =>
995+
r.copy(child = recursiveRemoveSort(r.child))
996+
case r: Repartition if r.shuffle =>
997+
r.copy(child = recursiveRemoveSort(r.child))
990998
}
991999

9921000
private def recursiveRemoveSort(plan: LogicalPlan): LogicalPlan = plan match {
Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.spark.sql.catalyst.optimizer
19+
20+
import org.apache.spark.sql.catalyst.analysis.{Analyzer, EmptyFunctionRegistry}
21+
import org.apache.spark.sql.catalyst.catalog.{InMemoryCatalog, SessionCatalog}
22+
import org.apache.spark.sql.catalyst.dsl.expressions._
23+
import org.apache.spark.sql.catalyst.dsl.plans._
24+
import org.apache.spark.sql.catalyst.plans.PlanTest
25+
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
26+
import org.apache.spark.sql.catalyst.rules.RuleExecutor
27+
28+
/**
 * Plan-level tests for `EliminateSorts` when a sort sits underneath the node produced by
 * [[repartition]].
 *
 * Subclasses swap in a different wrapping node by overriding [[repartition]] and state, via
 * [[isOptimized]], whether the sort is expected to disappear for that node.
 */
class EliminateSortsBeforeRepartitionSuite extends PlanTest {

  val catalog = new SessionCatalog(new InMemoryCatalog, EmptyFunctionRegistry, conf)
  val analyzer = new Analyzer(catalog, conf)
  val testRelation = LocalRelation('a.int, 'b.int, 'c.int)

  /** Minimal optimizer used by the tests: the rule under test plus a few companion rules. */
  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = List(
      Batch("Default", FixedPoint(10), FoldablePropagation, LimitPushDown),
      Batch("Eliminate Sorts", Once, EliminateSorts),
      Batch("Collapse Project", Once, CollapseProject))
  }

  /** Wraps `plan` in the node whose effect on a child sort is being tested. */
  def repartition(plan: LogicalPlan): LogicalPlan = plan.repartition(10)

  /** Whether the sort below [[repartition]] is expected to be removed. */
  def isOptimized: Boolean = true

  /**
   * Analyzes and optimizes `input`, then compares the result with the analyzed `expected` plan.
   * `expected` is by-name so that it is only built after the optimizer has run.
   */
  private def verifyOptimizedPlan(input: LogicalPlan)(expected: => LogicalPlan): Unit = {
    val actual = Optimize.execute(analyzer.execute(input))
    comparePlans(actual, analyzer.execute(expected))
  }

  /**
   * Asserts that repartitioning `sorted` optimizes to the repartitioned `unsorted` plan when
   * [[isOptimized]] is true, and is left untouched otherwise.
   */
  private def verifySortRemoval(sorted: LogicalPlan)(unsorted: => LogicalPlan): Unit = {
    val input = repartition(sorted)
    verifyOptimizedPlan(input) {
      if (isOptimized) repartition(unsorted) else input
    }
  }

  /** Asserts that repartitioning `sorted` is left untouched by the optimizer. */
  private def verifySortRetained(sorted: LogicalPlan): Unit = {
    val input = repartition(sorted)
    verifyOptimizedPlan(input)(input)
  }

  test("sortBy") {
    verifySortRemoval(testRelation.select('a, 'b).sortBy('a.asc, 'b.desc)) {
      testRelation.select('a, 'b)
    }
  }

  test("sortBy with projection") {
    val sorted = testRelation
      .select('a, 'b)
      .sortBy('a.asc, 'b.asc)
      .select('a + 1 as "a", 'b + 2 as "b")
    verifySortRemoval(sorted) {
      testRelation.select('a + 1 as "a", 'b + 2 as "b")
    }
  }

  test("sortBy with projection and filter") {
    val sorted = testRelation
      .sortBy('a.asc, 'b.asc)
      .select('a, 'b)
      .where('a === 10)
    verifySortRemoval(sorted) {
      testRelation.select('a, 'b).where('a === 10)
    }
  }

  test("sortBy with limit") {
    verifySortRetained(testRelation.sortBy('a.asc, 'b.asc).limit(10))
  }

  test("sortBy with non-deterministic projection") {
    verifySortRetained(testRelation.sortBy('a.asc, 'b.asc).select(rand(1), 'a, 'b))
  }

  test("orderBy") {
    verifySortRemoval(testRelation.select('a, 'b).orderBy('a.asc, 'b.asc)) {
      testRelation.select('a, 'b)
    }
  }

  test("orderBy with projection") {
    val sorted = testRelation
      .select('a, 'b)
      .orderBy('a.asc, 'b.asc)
      .select('a + 1 as "a", 'b + 2 as "b")
    verifySortRemoval(sorted) {
      testRelation.select('a + 1 as "a", 'b + 2 as "b")
    }
  }

  test("orderBy with projection and filter") {
    val sorted = testRelation
      .orderBy('a.asc, 'b.asc)
      .select('a, 'b)
      .where('a === 10)
    verifySortRemoval(sorted) {
      testRelation.select('a, 'b).where('a === 10)
    }
  }

  test("orderBy with limit") {
    verifySortRetained(testRelation.orderBy('a.asc, 'b.asc).limit(10))
  }

  test("orderBy with non-deterministic projection") {
    verifySortRetained(testRelation.orderBy('a.asc, 'b.asc).select(rand(1), 'a, 'b))
  }
}
156+
157+
/**
 * Re-runs the inherited sort-elimination tests with `distribute('a, 'b)(10)` as the wrapping
 * node, and adds cases where the distribution expressions include `rand(1)`.
 */
class EliminateSortsBeforeRepartitionByExprsSuite extends EliminateSortsBeforeRepartitionSuite {

  override def isOptimized: Boolean = true

  override def repartition(plan: LogicalPlan): LogicalPlan = {
    plan.distribute('a, 'b)(10)
  }

  /**
   * Distributes `sortedAndLimited` by `rand(1)` and `'a` into 20 partitions and asserts that
   * the optimizer leaves the resulting plan untouched.
   */
  private def verifyUnchangedUnderRandomDistribution(sortedAndLimited: LogicalPlan): Unit = {
    val input = sortedAndLimited.distribute(rand(1).asc, 'a.asc)(20)
    val actual = Optimize.execute(analyzer.execute(input))
    comparePlans(actual, analyzer.execute(input))
  }

  // NOTE(review): both inputs below also carry `limit(10)`, and the inherited
  // "... with limit" tests already expect a sort under a limit to be kept. These cases may
  // therefore pass regardless of the non-deterministic expressions -- confirm intent.
  test("sortBy before repartition with non-deterministic expressions") {
    verifyUnchangedUnderRandomDistribution(testRelation.sortBy('a.asc, 'b.asc).limit(10))
  }

  test("orderBy before repartition with non-deterministic expressions") {
    verifyUnchangedUnderRandomDistribution(testRelation.orderBy('a.asc, 'b.asc).limit(10))
  }
}
175+
176+
/**
 * Re-runs the inherited tests with `coalesce(1)` as the wrapping node. Because
 * `isOptimized` is false, every inherited test expects the optimizer to leave the plan,
 * including its sort, unchanged.
 */
class EliminateSortsBeforeCoalesceSuite extends EliminateSortsBeforeRepartitionSuite {

  override def isOptimized: Boolean = false

  override def repartition(plan: LogicalPlan): LogicalPlan = {
    plan.coalesce(1)
  }
}

0 commit comments

Comments
 (0)