 package org.apache.spark.storage

 import java.util.concurrent.Semaphore
+import java.util.concurrent.atomic.AtomicReference

 import scala.collection.mutable.ArrayBuffer
 import scala.concurrent.duration._
@@ -57,6 +58,11 @@ class BlockManagerDecommissionIntegrationSuite extends SparkFunSuite with LocalS
       .set(config.STORAGE_DECOMMISSION_ENABLED, true)
       .set(config.STORAGE_DECOMMISSION_RDD_BLOCKS_ENABLED, persist)
       .set(config.STORAGE_DECOMMISSION_SHUFFLE_BLOCKS_ENABLED, shuffle)
+      // Force exactly one executor per worker such that all block managers
+      // get the shuffle and RDD blocks.
+      .set(config.EXECUTOR_CORES.key, "1")
+      .set(config.CPUS_PER_TASK.key, "1")
+      .set(config.EXECUTOR_MEMORY.key, "1024m")
       // Just replicate blocks as fast as we can during testing, there isn't another
       // workload we need to worry about.
       .set(config.STORAGE_DECOMMISSION_REPLICATION_REATTEMPT_INTERVAL, 1L)
@@ -92,6 +98,8 @@ class BlockManagerDecommissionIntegrationSuite extends SparkFunSuite with LocalS
     val executorRemovedSem = new Semaphore(0)
     val taskEndEvents = ArrayBuffer.empty[SparkListenerTaskEnd]
     val blocksUpdated = ArrayBuffer.empty[SparkListenerBlockUpdated]
+    val sched = sc.schedulerBackend.asInstanceOf[StandaloneSchedulerBackend]
+    val execToDecommission = new AtomicReference[String](null)
     sc.addSparkListener(new SparkListener {

       override def onExecutorRemoved(execRemoved: SparkListenerExecutorRemoved): Unit = {
@@ -107,6 +115,21 @@ class BlockManagerDecommissionIntegrationSuite extends SparkFunSuite with LocalS
       }

       override def onBlockUpdated(blockUpdated: SparkListenerBlockUpdated): Unit = {
+        if (blockUpdated.blockUpdatedInfo.blockId.isRDD && persist) {
+          // Persisted RDD blocks are a bit weirder than shuffle blocks: even though
+          // the tasks run on, say, executors (0, 1, 2), the RDD blocks might end up only
+          // on executors 0 and 1. So we cannot just indiscriminately decommission any executor.
+          // Instead we must decommission an executor that actually has an RDD block.
+          // Fortunately, this isn't the case for shuffle blocks, which are present on all
+          // executors, so any executor can be decommissioned when `persist` is false.
+          val candidateExecToDecom = blockUpdated.blockUpdatedInfo.blockManagerId.executorId
+          if (execToDecommission.compareAndSet(null, candidateExecToDecom)) {
+            val decomContext = s"Decommissioning executor ${candidateExecToDecom} for persist"
+            logInfo(decomContext)
+            sched.decommissionExecutor(candidateExecToDecom,
+              ExecutorDecommissionInfo(decomContext, false))
+          }
+        }
         // Once broadcast start landing on the executors we're good to proceed.
         // We don't only use task start as it can occur before the work is on the executor.
         if (blockUpdated.blockUpdatedInfo.blockId.isBroadcast) {
@@ -139,14 +162,17 @@ class BlockManagerDecommissionIntegrationSuite extends SparkFunSuite with LocalS
       ThreadUtils.awaitResult(asyncCount, 15.seconds)
     }

-    // Decommission one of the executors.
-    val sched = sc.schedulerBackend.asInstanceOf[StandaloneSchedulerBackend]
     val execs = sched.getExecutorIds()
     assert(execs.size == numExecs, s"Expected ${numExecs} executors but found ${execs.size}")

-    val execToDecommission = execs.head
-    logDebug(s"Decommissioning executor ${execToDecommission}")
-    sched.decommissionExecutor(execToDecommission, ExecutorDecommissionInfo("", false))
+    if (!persist && execToDecommission.compareAndSet(null, execs.head)) {
+      // For non-persisted blocks we can decommission any executor, since the shuffle blocks
+      // are present on every executor.
+      val decomContext = s"Decommissioning executor ${execToDecommission.get()}"
+      logInfo(decomContext)
+      sched.decommissionExecutor(execToDecommission.get(),
+        ExecutorDecommissionInfo(decomContext, false))
+    }

     // Wait for job to finish.
     val asyncCountResult = ThreadUtils.awaitResult(asyncCount, 15.seconds)
@@ -206,15 +232,15 @@ class BlockManagerDecommissionIntegrationSuite extends SparkFunSuite with LocalS
     val execIdToBlocksMapping = storageStatus.map(
       status => (status.blockManagerId.executorId, status.blocks)).toMap
     // No cached blocks should be present on executor which was decommissioned
-    assert(execIdToBlocksMapping(execToDecommission).keys.filter(_.isRDD).toSeq === Seq(),
+    assert(execIdToBlocksMapping(execToDecommission.get()).keys.filter(_.isRDD).toSeq === Seq(),
       "Cache blocks should be migrated")
     if (persist) {
       // There should still be all the RDD blocks cached
       assert(execIdToBlocksMapping.values.flatMap(_.keys).count(_.isRDD) === numParts)
     }

     // Make the executor we decommissioned exit
-    sched.client.killExecutors(List(execToDecommission))
+    sched.client.killExecutors(List(execToDecommission.get()))

     // Wait for the executor to be removed
     executorRemovedSem.acquire(1)