Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

package org.apache.spark.sql.execution.adaptive

import org.apache.spark.sql.catalyst.expressions.SortOrder
import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Distribution, HashPartitioning, UnspecifiedDistribution}
import org.apache.spark.sql.execution.{CollectMetricsExec, FilterExec, ProjectExec, SortExec, SparkPlan}
import org.apache.spark.sql.execution.exchange.{REPARTITION_BY_COL, REPARTITION_BY_NUM, ShuffleExchangeExec}
Expand Down Expand Up @@ -57,4 +58,26 @@ object AQEUtils {
}
case _ => Some(UnspecifiedDistribution)
}

// Analyze the given plan and calculate the required ordering of this plan w.r.t. the
// user-specified sort.
def getRequiredOrdering(p: SparkPlan): Seq[SortOrder] = p match {
// User-specified sort is only effective when it's the root node, or under
// Project/Filter/CollectMetrics.
case f: FilterExec => getRequiredOrdering(f.child)
case c: CollectMetricsExec => getRequiredOrdering(c.child)
// We do not need to care whether the sort is global or not, since the output partitioning
// is ensured by requiredDistribution.
case s: SortExec => s.outputOrdering
case p: ProjectExec =>
val requiredOrdering = getRequiredOrdering(p.child)
// avoid case `df.sort(a, b).select(c)`
if (requiredOrdering.map(_.child).forall(e => p.projectList.exists(_.semanticEquals(e)))) {
requiredOrdering
} else {
Nil
}

case _ => Nil
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ import org.apache.spark.broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, ReturnAnswer}
import org.apache.spark.sql.catalyst.plans.physical.{Distribution, UnspecifiedDistribution}
import org.apache.spark.sql.catalyst.rules.{PlanChangeLogger, Rule}
Expand Down Expand Up @@ -97,6 +97,13 @@ case class AdaptiveSparkPlanExec(
AQEUtils.getRequiredDistribution(inputPlan)
}

// Make sure AQE does not change the user-specified output ordering
@transient private val requiredOrdering: Seq[SortOrder] = if (isSubquery) {
Nil
} else {
AQEUtils.getRequiredOrdering(inputPlan)
}

@transient private val costEvaluator =
conf.getConf(SQLConf.ADAPTIVE_CUSTOM_COST_EVALUATOR_CLASS) match {
case Some(className) => CostEvaluator.instantiate(className, session.sparkContext.getConf)
Expand All @@ -112,7 +119,7 @@ case class AdaptiveSparkPlanExec(
// `EnsureRequirements` to not optimize out the user-specified repartition-by-col to work
// around this case.
val ensureRequirements =
EnsureRequirements(requiredDistribution.isDefined, requiredDistribution)
EnsureRequirements(requiredDistribution.isDefined, requiredDistribution, requiredOrdering)
Seq(
RemoveRedundantProjects,
ensureRequirements,
Expand Down Expand Up @@ -163,7 +170,7 @@ case class AdaptiveSparkPlanExec(
} else {
UnspecifiedDistribution
}
if (ValidateRequirements.validate(applied, distribution)) {
if (ValidateRequirements.validate(applied, distribution, requiredOrdering)) {
applied
} else {
logDebug(s"Rule ${rule.ruleName} is not applied as it breaks the " +
Expand Down Expand Up @@ -207,6 +214,8 @@ case class AdaptiveSparkPlanExec(

override def output: Seq[Attribute] = inputPlan.output

override def outputOrdering: Seq[SortOrder] = requiredOrdering
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we need to override outputPartitioning as well?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can, but there is no requirements about outputPartitioning


override def doCanonicalize(): SparkPlan = inputPlan.canonicalized

override def resetMetrics(): Unit = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,13 @@ import org.apache.spark.sql.execution.joins.{ShuffledHashJoinExec, SortMergeJoin
* repartition shuffles in the plan.
* @param requiredDistribution The root required distribution we should ensure. This value is used
* in AQE in case we change final stage output partitioning.
* @param requiredOrdering The root required ordering we should ensure. This value is used
* in AQE in case we change final stage output ordering.
*/
case class EnsureRequirements(
optimizeOutRepartition: Boolean = true,
requiredDistribution: Option[Distribution] = None)
requiredDistribution: Option[Distribution] = None,
requiredOrdering: Seq[SortOrder] = Nil)
extends Rule[SparkPlan] {

private def ensureDistributionAndOrdering(
Expand Down Expand Up @@ -333,10 +336,18 @@ case class EnsureRequirements(
val finalPlan = ensureDistributionAndOrdering(
newPlan :: Nil,
requiredDistribution.get :: Nil,
Seq(Nil),
Seq(requiredOrdering),
shuffleOrigin)
assert(finalPlan.size == 1)
finalPlan.head
} else if (requiredOrdering.nonEmpty) {
val finalPlan = ensureDistributionAndOrdering(
newPlan :: Nil,
Nil,
Seq(requiredOrdering),
ENSURE_REQUIREMENTS)
assert(finalPlan.size == 1)
finalPlan.head
} else {
newPlan
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,14 @@ import org.apache.spark.sql.execution._
*/
object ValidateRequirements extends Logging {

def validate(
plan: SparkPlan,
requiredDistribution: Distribution,
requiredOrdering: Seq[SortOrder]): Boolean = {
validate(plan, requiredDistribution) &&
SortOrder.orderingSatisfies(plan.outputOrdering, requiredOrdering)
}

def validate(plan: SparkPlan, requiredDistribution: Distribution): Boolean = {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems we can remove this now?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OptimizeSkewedJoin still use this. I considered unify them, but seems OptimizeSkewedJoin does not affect the required output ordering.

validate(plan) && plan.outputPartitioning.satisfies(requiredDistribution)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1468,8 +1468,8 @@ class AdaptiveQueryExecSuite
val (plan2, adaptivePlan2) = runAdaptiveAndVerifyResult(
"SELECT key FROM (SELECT * FROM testData WHERE value = 'no_match' ORDER BY key)" +
" WHERE key > rand()")
assert(findTopLevelSort(plan2).size == 1)
assert(stripAQEPlan(adaptivePlan2).isInstanceOf[LocalTableScanExec])
assert(findTopLevelSort(plan2).head.global)
assert(!findTopLevelSort(adaptivePlan2).head.global)
}
}

Expand Down Expand Up @@ -2535,13 +2535,13 @@ class AdaptiveQueryExecSuite
(1 to 4).map(i => TestData(i, i.toString)), 2)
.toDF("c1", "c2").createOrReplaceTempView("v")

// remove sort
// remove global sort
val (origin1, adaptive1) = runAdaptiveAndVerifyResult(
"""
|SELECT * FROM v where c1 = 1 order by c1, c2
|""".stripMargin)
assert(findTopLevelSort(origin1).size == 1)
assert(findTopLevelSort(adaptive1).isEmpty)
assert(findTopLevelSort(origin1).head.global)
assert(!findTopLevelSort(adaptive1).head.global)

// convert group only aggregate to project
val (origin2, adaptive2) = runAdaptiveAndVerifyResult(
Expand Down Expand Up @@ -2575,6 +2575,24 @@ class AdaptiveQueryExecSuite
assert(findTopLevelAggregate(adaptive5).size == 4)
}
}

test("SPARK-38578: AdaptiveSparkPlanExec should ensure user-specified ordering") {
Seq(
("key", "key, value", false),
("key as x", "key, value", false),
("key", "key", true),
("key, value", "key", true)
).foreach { case (project, sort, required) =>
// During re-optimize in AQE, the sort will be converted to local relation if it's empty
// So this test ensure we will add sort back if it is User-specified
val (origin, adaptive) = runAdaptiveAndVerifyResult(
s"""
|SELECT $project FROM testdata where key < 0 ORDER BY $sort
|""".stripMargin)
assert(findTopLevelSort(origin).size == 1)
assert(findTopLevelSort(adaptive).nonEmpty == required)
}
}
}

/**
Expand Down