@@ -96,6 +96,15 @@ class BlockManagerMasterEndpoint(
9696 mapper
9797 }
9898
// Timeout compared against (now - executorRemovalTs) when expiring removed block managers.
private val executorTimeoutMs = Utils.executorTimeoutMs(conf)

// Single-threaded daemon scheduler that periodically invokes cleanBlockManagerInfo().
private val blockManagerInfoCleaner = {
  // Run at half the timeout. scheduleWithFixedDelay throws IllegalArgumentException for a
  // non-positive delay, so clamp to at least 1 ms in case the timeout is configured below 2 ms.
  val cleaningDelay = math.max(1L, Math.floorDiv(executorTimeoutMs, 2L))
  val executor = ThreadUtils.newDaemonSingleThreadScheduledExecutor("blockManagerInfo-cleaner")
  val cleanTask: Runnable = () => {
    try {
      cleanBlockManagerInfo()
    } catch {
      // An exception escaping a periodic task suppresses every subsequent execution, which
      // would silently stop the cleanup; log and keep the schedule alive instead.
      case scala.util.control.NonFatal(e) =>
        logWarning("Failed to clean blockManagerInfo", e)
    }
  }
  executor.scheduleWithFixedDelay(cleanTask, cleaningDelay, cleaningDelay, TimeUnit.MILLISECONDS)
  executor
}
107+
99108 val proactivelyReplicate = conf.get(config.STORAGE_REPLICATION_PROACTIVE )
100109
101110 val defaultRpcTimeout = RpcUtils .askRpcTimeout(conf)
@@ -273,12 +282,12 @@ class BlockManagerMasterEndpoint(
273282 }
274283 }
275284 bmIdsExecutor.foreach { bmId =>
276- blockManagerInfo.get (bmId).foreach { bmInfo =>
285+ aliveBlockManagerInfo (bmId).foreach { bmInfo =>
277286 bmInfo.removeBlock(blockId)
278287 }
279288 }
280289 }
281- val removeRddFromExecutorsFutures = blockManagerInfo.values .map { bmInfo =>
290+ val removeRddFromExecutorsFutures = allAliveBlockManagerInfos .map { bmInfo =>
282291 bmInfo.storageEndpoint.ask[Int ](removeMsg).recover {
283292 // use 0 as default value means no blocks were removed
284293 handleBlockRemovalFailure(" RDD" , rddId.toString, bmInfo.blockManagerId, 0 )
@@ -304,7 +313,7 @@ class BlockManagerMasterEndpoint(
304313 // Nothing to do in the BlockManagerMasterEndpoint data structures
305314 val removeMsg = RemoveShuffle (shuffleId)
306315 Future .sequence(
307- blockManagerInfo.values .map { bm =>
316+ allAliveBlockManagerInfos .map { bm =>
308317 bm.storageEndpoint.ask[Boolean ](removeMsg).recover {
309318 // use false as default value means no shuffle data were removed
310319 handleBlockRemovalFailure(" shuffle" , shuffleId.toString, bm.blockManagerId, false )
@@ -320,7 +329,7 @@ class BlockManagerMasterEndpoint(
320329 */
321330 private def removeBroadcast (broadcastId : Long , removeFromDriver : Boolean ): Future [Seq [Int ]] = {
322331 val removeMsg = RemoveBroadcast (broadcastId, removeFromDriver)
323- val requiredBlockManagers = blockManagerInfo.values .filter { info =>
332+ val requiredBlockManagers = allAliveBlockManagerInfos .filter { info =>
324333 removeFromDriver || ! info.blockManagerId.isDriver
325334 }
326335 val futures = requiredBlockManagers.map { bm =>
@@ -336,13 +345,24 @@ class BlockManagerMasterEndpoint(
336345 private def removeBlockManager (blockManagerId : BlockManagerId ): Unit = {
337346 val info = blockManagerInfo(blockManagerId)
338347
348+ // SPARK-35011: Not removing info from the blockManagerInfo map, but only setting the removal
349+ // timestamp of the executor in BlockManagerInfo. This info will be removed from
350+ // blockManagerInfo map by the blockManagerInfoCleaner once
351+ // now() - info.executorRemovalTs > executorTimeoutMs.
352+ //
353+ // We are delaying the removal of BlockManagerInfo to avoid a BlockManager reregistration
354+ // while an executor is shutting down. This unwanted reregistration causes inconsistent
355+ // bookkeeping of executors in Spark.
356+ // Delaying this removal until blockManagerInfoCleaner decides to remove it ensures
357+ // BlockManagerMasterHeartbeatEndpoint does not ask the BlockManager on a recently removed
358+ // executor to reregister on BlockManagerHeartbeat message.
359+ info.setExecutorRemovalTs()
360+
339361 // Remove the block manager from blockManagerIdByExecutor.
340362 blockManagerIdByExecutor -= blockManagerId.executorId
341363 decommissioningBlockManagerSet.remove(blockManagerId)
342364
343- // Remove it from blockManagerInfo and remove all the blocks.
344- blockManagerInfo.remove(blockManagerId)
345-
365+ // remove all the blocks.
346366 val iterator = info.blocks.keySet.iterator
347367 while (iterator.hasNext) {
348368 val blockId = iterator.next
@@ -363,7 +383,7 @@ class BlockManagerMasterEndpoint(
363383 val i = (new Random (blockId.hashCode)).nextInt(locations.size)
364384 val blockLocations = locations.toSeq
365385 val candidateBMId = blockLocations(i)
366- blockManagerInfo.get (candidateBMId).foreach { bm =>
386+ aliveBlockManagerInfo (candidateBMId).foreach { bm =>
367387 val remainingLocations = locations.toSeq.filter(bm => bm != candidateBMId)
368388 val replicateMsg = ReplicateBlock (blockId, remainingLocations, maxReplicas)
369389 bm.storageEndpoint.ask[Boolean ](replicateMsg)
@@ -399,16 +419,16 @@ class BlockManagerMasterEndpoint(
399419 */
400420 private def getReplicateInfoForRDDBlocks (blockManagerId : BlockManagerId ): Seq [ReplicateBlock ] = {
401421 try {
402- val info = blockManagerInfo(blockManagerId)
403-
404- val rddBlocks = info.blocks.keySet().asScala.filter(_.isRDD)
405- rddBlocks.map { blockId =>
406- val currentBlockLocations = blockLocations.get(blockId)
407- val maxReplicas = currentBlockLocations.size + 1
408- val remainingLocations = currentBlockLocations.toSeq.filter(bm => bm != blockManagerId )
409- val replicateMsg = ReplicateBlock (blockId, remainingLocations, maxReplicas)
410- replicateMsg
411- }.toSeq
422+ aliveBlockManagerInfo(blockManagerId).map { info =>
423+ val rddBlocks = info.blocks.keySet().asScala.filter(_.isRDD)
424+ rddBlocks.map { blockId =>
425+ val currentBlockLocations = blockLocations.get(blockId)
426+ val maxReplicas = currentBlockLocations.size + 1
427+ val remainingLocations = currentBlockLocations.toSeq.filter(bm => bm != blockManagerId)
428+ val replicateMsg = ReplicateBlock (blockId, remainingLocations, maxReplicas )
429+ replicateMsg
430+ }.toSeq
431+ }.getOrElse( Seq .empty[ ReplicateBlock ])
412432 } catch {
413433 // If the block manager has already exited, nothing to replicate.
414434 case e : java.util.NoSuchElementException =>
@@ -422,8 +442,7 @@ class BlockManagerMasterEndpoint(
422442 val locations = blockLocations.get(blockId)
423443 if (locations != null ) {
424444 locations.foreach { blockManagerId : BlockManagerId =>
425- val blockManager = blockManagerInfo.get(blockManagerId)
426- blockManager.foreach { bm =>
445+ aliveBlockManagerInfo(blockManagerId).foreach { bm =>
427446 // Remove the block from the BlockManager.
428447 // Doesn't actually wait for a confirmation and the message might get lost.
429448 // If message loss becomes frequent, we should add retry logic here.
@@ -438,14 +457,14 @@ class BlockManagerMasterEndpoint(
438457
439458 // Return a map from the block manager id to max memory and remaining memory.
440459 private def memoryStatus : Map [BlockManagerId , (Long , Long )] = {
441- blockManagerInfo .map { case (blockManagerId, info) =>
442- (blockManagerId, (info.maxMem, info.remainingMem))
// The added lines below build the same (maxMem, remainingMem) pair per block manager id, but
// iterate allAliveBlockManagerInfos, so only entries whose isAlive is true are reported.
460+ allAliveBlockManagerInfos .map { info =>
461+ (info. blockManagerId, (info.maxMem, info.remainingMem))
442462 }.toMap
443463 }
445464
// Builds one StorageStatus per block manager from its id, memory limits and block map.
446465 private def storageStatus : Array [StorageStatus ] = {
447- blockManagerInfo .map { case (blockManagerId, info) =>
448- new StorageStatus (blockManagerId, info.maxMem, Some (info.maxOnHeapMem),
// The added lines below take the id from the info object and iterate
// allAliveBlockManagerInfos, so only entries whose isAlive is true are included.
466+ allAliveBlockManagerInfos .map { info =>
467+ new StorageStatus (info. blockManagerId, info.maxMem, Some (info.maxOnHeapMem),
449468 Some (info.maxOffHeapMem), info.blocks.asScala)
450469 }.toArray
451470 }
@@ -467,7 +486,7 @@ class BlockManagerMasterEndpoint(
467486 * Futures to avoid potential deadlocks. This can arise if there exists a block manager
468487 * that is also waiting for this master endpoint's response to a previous message.
469488 */
470- blockManagerInfo.values .map { info =>
489+ allAliveBlockManagerInfos .map { info =>
471490 val blockStatusFuture =
472491 if (askStorageEndpoints) {
473492 info.storageEndpoint.ask[Option [BlockStatus ]](getBlockStatus)
@@ -491,7 +510,7 @@ class BlockManagerMasterEndpoint(
491510 askStorageEndpoints : Boolean ): Future [Seq [BlockId ]] = {
492511 val getMatchingBlockIds = GetMatchingBlockIds (filter)
493512 Future .sequence(
494- blockManagerInfo.values .map { info =>
513+ allAliveBlockManagerInfos .map { info =>
495514 val future =
496515 if (askStorageEndpoints) {
497516 info.storageEndpoint.ask[Seq [BlockId ]](getMatchingBlockIds)
@@ -557,9 +576,10 @@ class BlockManagerMasterEndpoint(
557576 if (pushBasedShuffleEnabled) {
558577 addMergerLocation(id)
559578 }
579+
580+ listenerBus.post(SparkListenerBlockManagerAdded (time, id,
581+ maxOnHeapMemSize + maxOffHeapMemSize, Some (maxOnHeapMemSize), Some (maxOffHeapMemSize)))
560582 }
561- listenerBus.post(SparkListenerBlockManagerAdded (time, id, maxOnHeapMemSize + maxOffHeapMemSize,
562- Some (maxOnHeapMemSize), Some (maxOffHeapMemSize)))
563583 id
564584 }
565585
@@ -647,7 +667,7 @@ class BlockManagerMasterEndpoint(
647667 if (externalShuffleServiceRddFetchEnabled && bmId.port == externalShuffleServicePort) {
648668 Option (blockStatusByShuffleService(bmId).get(blockId))
649669 } else {
650- blockManagerInfo.get (bmId).flatMap(_.getStatus(blockId))
670+ aliveBlockManagerInfo (bmId).flatMap(_.getStatus(blockId))
651671 }
652672 }
653673
@@ -658,8 +678,7 @@ class BlockManagerMasterEndpoint(
658678 // can be used to access this block even when the original executor is already stopped.
659679 loc.host == requesterHost &&
660680 (loc.port == externalShuffleServicePort ||
661- blockManagerInfo
662- .get(loc)
681+ aliveBlockManagerInfo(loc)
663682 .flatMap(_.getStatus(blockId).map(_.storageLevel.useDisk))
664683 .getOrElse(false ))
665684 }.flatMap { bmId => Option (executorIdToLocalDirs.getIfPresent(bmId.executorId)) }
@@ -676,7 +695,7 @@ class BlockManagerMasterEndpoint(
676695
677696 /** Get the list of the peers of the given block manager */
678697 private def getPeers (blockManagerId : BlockManagerId ): Seq [BlockManagerId ] = {
679- val blockManagerIds = blockManagerInfo.keySet
698+ val blockManagerIds = allAliveBlockManagerInfos.map(_.blockManagerId).toSet
680699 if (blockManagerIds.contains(blockManagerId)) {
681700 blockManagerIds
682701 .filterNot { _.isDriver }
@@ -728,15 +747,35 @@ class BlockManagerMasterEndpoint(
// Resolves an executor id to the storage endpoint of its block manager, if both lookups succeed.
728747 private def getExecutorEndpointRef (executorId : String ): Option [RpcEndpointRef ] = {
729748 for (
730749 blockManagerId <- blockManagerIdByExecutor.get(executorId);
731- info <- blockManagerInfo.get (blockManagerId)
// The added line below uses aliveBlockManagerInfo, which yields None for an entry whose
// isAlive is false, so no endpoint is returned for a block manager marked as removed.
750+ info <- aliveBlockManagerInfo (blockManagerId)
732751 ) yield {
733752 info.storageEndpoint
734753 }
735754 }
736755
/** Shuts down askThreadPool and blockManagerInfoCleaner immediately via shutdownNow(). */
override def onStop(): Unit = {
  // Ask pool first, then the periodic cleaner.
  askThreadPool.shutdownNow()
  blockManagerInfoCleaner.shutdownNow()
}
760+
/**
 * Drops entries from blockManagerInfo whose executor removal timestamp is more than
 * executorTimeoutMs milliseconds in the past. Entries that are still alive are left untouched.
 *
 * NOTE(review): this is scheduled on the blockManagerInfo-cleaner thread; confirm that
 * blockManagerInfo tolerates being read and mutated here while other code uses the map.
 */
private def cleanBlockManagerInfo(): Unit = {
  logDebug("Cleaning blockManagerInfo")
  val now = System.currentTimeMillis()
  // executorRemovalTs is empty exactly while the block manager is alive, so `exists` covers
  // both the liveness check and the age check without unwrapping the Option.
  // The ids are collected up front so the map is not mutated while it is being traversed.
  val expiredBmIds = blockManagerInfo.collect {
    case (bmId, bmInfo) if bmInfo.executorRemovalTs.exists(ts => now - ts > executorTimeoutMs) =>
      bmId
  }.toSeq
  expiredBmIds.foreach { bmId =>
    logInfo(s"Cleaning expired $bmId from blockManagerInfo")
    blockManagerInfo.remove(bmId)
  }
}
773+
/** Looks up `bmId` in blockManagerInfo, yielding the entry only if it is still marked alive. */
@inline private def aliveBlockManagerInfo(bmId: BlockManagerId): Option[BlockManagerInfo] =
  for {
    info <- blockManagerInfo.get(bmId)
    if info.isAlive
  } yield info
776+
/** All values of blockManagerInfo that are still marked alive. */
@inline private def allAliveBlockManagerInfos: Iterable[BlockManagerInfo] = {
  val everyInfo = blockManagerInfo.values
  everyInfo.filter(info => info.isAlive)
}
740779}
741780
742781@ DeveloperApi
@@ -764,6 +803,7 @@ private[spark] class BlockManagerInfo(
764803
765804 private var _lastSeenMs : Long = timeMs
766805 private var _remainingMem : Long = maxMem
806+ private var _executorRemovalTs : Option [Long ] = None
767807
768808 // Mapping from block id to its status.
769809 private val _blocks = new JHashMap [BlockId , BlockStatus ]
@@ -878,4 +918,16 @@ private[spark] class BlockManagerInfo(
/** Empties the block-id-to-status map held by this instance. */
def clear(): Unit = _blocks.clear()
921+
/** Timestamp (from System.currentTimeMillis) recorded when the executor was marked removed. */
def executorRemovalTs: Option[Long] = _executorRemovalTs

/** True while no executor removal timestamp has been recorded. */
def isAlive: Boolean = executorRemovalTs.isEmpty
925+
/**
 * Marks the executor as removed by recording the current wall-clock time in milliseconds.
 * Only the first call records a timestamp; later calls keep it and just log a warning.
 */
def setExecutorRemovalTs(): Unit = _executorRemovalTs match {
  case Some(ts) =>
    // Already marked: keep the original timestamp so the expiry clock is not restarted.
    logWarning(s"executorRemovalTs is already set to $ts")
  case None =>
    _executorRemovalTs = Some(System.currentTimeMillis())
}
881933}
0 commit comments