You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
{{ message }}
This repository was archived by the owner on Oct 23, 2024. It is now read-only.
[DCOS-46585] Fix supervised driver retry logic for outdated tasks (#46)
This commit fixes a bug where `--supervised` drivers would relaunch after receiving an outdated status update from a restarted/crashed agent, even if they had already been relaunched and were running elsewhere. In those scenarios, the previous logic would cause two identical jobs to run, and the ZK state would only have a record of the latest one, effectively orphaning the first job.
Copy file name to clipboardExpand all lines: resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala
+14 -1 Lines changed: 14 additions & 1 deletion
Original file line number
Diff line number
Diff line change
@@ -346,7 +346,7 @@ private[spark] class MesosClusterScheduler(
346
346
this.masterInfo =Some(masterInfo)
347
347
this.schedulerDriver = driver
348
348
349
-
if (!pendingRecover.isEmpty) {
349
+
if (pendingRecover.nonEmpty) {
350
350
// Start task reconciliation if we need to recover.
351
351
valstatuses= pendingRecover.collect {
352
352
case (taskId, slaveId) =>
@@ -766,6 +766,10 @@ private[spark] class MesosClusterScheduler(
766
766
valstate= launchedDrivers(subId)
767
767
// Check if the driver is supervise enabled and can be relaunched.
768
768
if (state.driverDescription.supervise && shouldRelaunch(status.getState)) {
769
+
if (taskIsOutdated(taskId, state)) {
770
+
// Prevent outdated task from overwriting a more recent status
Copy file name to clipboardExpand all lines: resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala
Copy file name to clipboardExpand all lines: resource-managers/mesos/src/test/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterSchedulerSuite.scala
+78 Lines changed: 78 additions & 0 deletions
Original file line number
Diff line number
Diff line change
@@ -421,6 +421,84 @@ class MesosClusterSchedulerSuite extends SparkFunSuite with LocalSparkContext wi
421
421
assert(state.finishedDrivers.size ==1)
422
422
}
423
423
424
+
test("does not restart outdated supervised drivers") {
425
+
// Covers scenario where:
426
+
// - agent goes down
427
+
// - supervised job is relaunched on another agent
428
+
// - first agent re-registers and sends status update: TASK_FAILED
0 commit comments