Skip to content

Commit 6d7eedf

Browse files
committed
YARN-9194. Invalid event: REGISTERED and LAUNCH_FAILED at FAILED, and NullPointerException happens in RM while shutdown a NM. (lujie via wangda)
Change-Id: I4359f59a73a278a941f4bb9d106dd38c9cb471fe
1 parent 0a46bae commit 6d7eedf

File tree

2 files changed

+90
-4
lines changed
  • hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src

2 files changed

+90
-4
lines changed

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/RMAppAttemptImpl.java

Lines changed: 11 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -437,9 +437,11 @@ RMAppAttemptEventType.STATUS_UPDATE, new StatusUpdateTransition())
437437
RMAppAttemptState.FAILED,
438438
EnumSet.of(
439439
RMAppAttemptEventType.LAUNCHED,
440+
RMAppAttemptEventType.LAUNCH_FAILED,
440441
RMAppAttemptEventType.EXPIRE,
441442
RMAppAttemptEventType.KILL,
442443
RMAppAttemptEventType.FAIL,
444+
RMAppAttemptEventType.REGISTERED,
443445
RMAppAttemptEventType.UNREGISTERED,
444446
RMAppAttemptEventType.STATUS_UPDATE,
445447
RMAppAttemptEventType.CONTAINER_ALLOCATED))
@@ -1203,10 +1205,16 @@ public RMAppAttemptState transition(RMAppAttemptImpl appAttempt,
12031205
}
12041206

12051207
// Set the masterContainer
1206-
appAttempt.setMasterContainer(amContainerAllocation.getContainers()
1207-
.get(0));
1208+
Container amContainer = amContainerAllocation.getContainers().get(0);
12081209
RMContainerImpl rmMasterContainer = (RMContainerImpl)appAttempt.scheduler
1209-
.getRMContainer(appAttempt.getMasterContainer().getId());
1210+
.getRMContainer(amContainer.getId());
1211+
//When an NM is removed, the scheduler cleans up its containers, and the
1212+
//following CONTAINER_FINISHED event will handle the cleaned-up container,
1213+
//so just return RMAppAttemptState.SCHEDULED here.
1214+
if (rmMasterContainer == null) {
1215+
return RMAppAttemptState.SCHEDULED;
1216+
}
1217+
appAttempt.setMasterContainer(amContainer);
12101218
rmMasterContainer.setAMContainer(true);
12111219
// The node set in NMTokenSecretManager is used for marking whether the
12121220
// NMToken has been issued for this node to the AM.

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/attempt/TestRMAppAttemptTransitions.java

Lines changed: 79 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -986,7 +986,7 @@ public void testLaunchedAtFinalSaving() {
986986
public void testAttemptAddedAtFinalSaving() {
987987
submitApplicationAttempt();
988988

989-
// SUBNITED->FINAL_SAVING
989+
// SUBMITTED->FINAL_SAVING
990990
applicationAttempt.handle(new RMAppAttemptEvent(applicationAttempt
991991
.getAppAttemptId(), RMAppAttemptEventType.KILL));
992992
assertEquals(RMAppAttemptState.FINAL_SAVING,
@@ -999,6 +999,56 @@ public void testAttemptAddedAtFinalSaving() {
999999
applicationAttempt.getAppAttemptState());
10001000
}
10011001

1002+
@Test(timeout = 10000)
1003+
public void testAttemptRegisteredAtFailed() {
1004+
Container amContainer = allocateApplicationAttempt();
1005+
launchApplicationAttempt(amContainer);
1006+
1007+
//send CONTAINER_FINISHED event
1008+
NodeId anyNodeId = NodeId.newInstance("host", 1234);
1009+
applicationAttempt.handle(new RMAppAttemptContainerFinishedEvent(
1010+
applicationAttempt.getAppAttemptId(), BuilderUtils.newContainerStatus(
1011+
amContainer.getId(), ContainerState.COMPLETE, "", 0,
1012+
amContainer.getResource()), anyNodeId));
1013+
assertEquals(RMAppAttemptState.FINAL_SAVING,
1014+
applicationAttempt.getAppAttemptState());
1015+
1016+
sendAttemptUpdateSavedEvent(applicationAttempt);
1017+
assertEquals(RMAppAttemptState.FAILED,
1018+
applicationAttempt.getAppAttemptState());
1019+
1020+
//send REGISTERED event
1021+
applicationAttempt.handle(new RMAppAttemptEvent(applicationAttempt
1022+
.getAppAttemptId(), RMAppAttemptEventType.REGISTERED));
1023+
1024+
assertEquals(RMAppAttemptState.FAILED,
1025+
applicationAttempt.getAppAttemptState());
1026+
}
1027+
1028+
@Test
1029+
public void testAttemptLaunchFailedAtFailed() {
1030+
Container amContainer = allocateApplicationAttempt();
1031+
launchApplicationAttempt(amContainer);
1032+
//send CONTAINER_FINISHED event
1033+
NodeId anyNodeId = NodeId.newInstance("host", 1234);
1034+
applicationAttempt.handle(new RMAppAttemptContainerFinishedEvent(
1035+
applicationAttempt.getAppAttemptId(), BuilderUtils.newContainerStatus(
1036+
amContainer.getId(), ContainerState.COMPLETE, "", 0,
1037+
amContainer.getResource()), anyNodeId));
1038+
assertEquals(RMAppAttemptState.FINAL_SAVING,
1039+
applicationAttempt.getAppAttemptState());
1040+
sendAttemptUpdateSavedEvent(applicationAttempt);
1041+
assertEquals(RMAppAttemptState.FAILED,
1042+
applicationAttempt.getAppAttemptState());
1043+
1044+
//send LAUNCH_FAILED event
1045+
applicationAttempt.handle(new RMAppAttemptEvent(applicationAttempt
1046+
.getAppAttemptId(), RMAppAttemptEventType.LAUNCH_FAILED));
1047+
1048+
assertEquals(RMAppAttemptState.FAILED,
1049+
applicationAttempt.getAppAttemptState());
1050+
}
1051+
10021052
@Test
10031053
public void testAMCrashAtAllocated() {
10041054
Container amContainer = allocateApplicationAttempt();
@@ -1598,6 +1648,34 @@ public void testFailedToFailed() {
15981648
assertTrue(found);
15991649
}
16001650

1651+
@Test
1652+
public void testContainerRemovedBeforeAllocate() {
1653+
scheduleApplicationAttempt();
1654+
1655+
// Mock the allocation of AM container
1656+
Container container = mock(Container.class);
1657+
Resource resource = BuilderUtils.newResource(2048, 1);
1658+
when(container.getId()).thenReturn(
1659+
BuilderUtils.newContainerId(applicationAttempt.getAppAttemptId(), 1));
1660+
when(container.getResource()).thenReturn(resource);
1661+
Allocation allocation = mock(Allocation.class);
1662+
when(allocation.getContainers()).
1663+
thenReturn(Collections.singletonList(container));
1664+
when(scheduler.allocate(any(ApplicationAttemptId.class), any(List.class),
1665+
any(List.class), any(List.class), any(List.class), any(List.class),
1666+
any(ContainerUpdates.class))).
1667+
thenReturn(allocation);
1668+
1669+
//container removed, so return null
1670+
when(scheduler.getRMContainer(container.getId())).
1671+
thenReturn(null);
1672+
1673+
applicationAttempt.handle(
1674+
new RMAppAttemptEvent(applicationAttempt.getAppAttemptId(),
1675+
RMAppAttemptEventType.CONTAINER_ALLOCATED));
1676+
assertEquals(RMAppAttemptState.SCHEDULED,
1677+
applicationAttempt.getAppAttemptState());
1678+
}
16011679

16021680
@SuppressWarnings("deprecation")
16031681
@Test

0 commit comments

Comments
 (0)