Skip to content

Commit e5d8060

Browse files
committed
YARN-11709. NodeManager should be shut down or blacklisted when it cannot run program /var/lib/yarn-ce/bin/container-executor
1 parent c835adb commit e5d8060

File tree

4 files changed

+87 additions, -10 deletions

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -173,7 +173,7 @@ public Path localizeClasspathJar(Path jarPath, Path target, String owner)
173173
* @throws InterruptedException if application init thread is halted by NM
174174
*/
175175
public abstract void startLocalizer(LocalizerStartContext ctx)
176-
throws IOException, InterruptedException;
176+
throws IOException, InterruptedException, ConfigurationException;
177177

178178
/**
179179
* Prepare the container prior to the launch environment being written.

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java

Lines changed: 22 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -389,7 +389,7 @@ public void stop() {
389389

390390
@Override
391391
public void startLocalizer(LocalizerStartContext ctx)
392-
throws IOException, InterruptedException {
392+
throws IOException, InterruptedException, ConfigurationException {
393393
Path nmPrivateContainerTokensPath = ctx.getNmPrivateContainerTokens();
394394
InetSocketAddress nmAddr = ctx.getNmAddr();
395395
String user = ctx.getUser();
@@ -440,9 +440,9 @@ public void startLocalizer(LocalizerStartContext ctx)
440440
localizerArgs = replaceWithContainerLogDir(localizerArgs, containerLogDir);
441441

442442
initializeContainerOp.appendArgs(localizerArgs);
443+
Configuration conf = super.getConf();
443444

444445
try {
445-
Configuration conf = super.getConf();
446446
PrivilegedOperationExecutor privilegedOperationExecutor =
447447
getPrivilegedOperationExecutor();
448448

@@ -452,7 +452,26 @@ public void startLocalizer(LocalizerStartContext ctx)
452452
} catch (PrivilegedOperationException e) {
453453
int exitCode = e.getExitCode();
454454
LOG.warn("Exit code from container {} startLocalizer is : {}",
455-
locId, exitCode, e);
455+
locId, exitCode, e);
456+
457+
if (exitCode ==
458+
ExitCode.INVALID_CONTAINER_EXEC_PERMISSIONS.getExitCode() ||
459+
exitCode == ExitCode.INVALID_CONFIG_FILE.getExitCode()) {
460+
throw new ConfigurationException("Application " + appId + " initialization failed" +
461+
" (exitCode=" + exitCode + ") with an unrecoverable config error. " +
462+
"Output: " + e.getOutput(), e);
463+
}
464+
465+
// Check if the failure was due to a missing container-executor binary
466+
Throwable cause = e.getCause() != null ? e.getCause() : e;
467+
if (cause instanceof IOException) {
468+
IOException io = (IOException) cause;
469+
if (io.getMessage().contains("No such file or directory")) {
470+
throw new ConfigurationException("Application " + appId + " initialization failed" +
471+
"(exitCode=" + exitCode + "). Container executor not found at "
472+
+ getContainerExecutorExecutablePath(conf), e);
473+
}
474+
}
456475

457476
throw new IOException("Application " + appId + " initialization failed" +
458477
" (exitCode=" + exitCode + ") with output: " + e.getOutput(), e);

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/localizer/ResourceLocalizationService.java

Lines changed: 9 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020
import static org.apache.hadoop.fs.CreateFlag.CREATE;
2121
import static org.apache.hadoop.fs.CreateFlag.OVERWRITE;
2222

23+
import org.apache.hadoop.yarn.exceptions.ConfigurationException;
2324
import org.apache.hadoop.yarn.server.nodemanager.recovery.RecoveryIterator;
2425
import org.slf4j.Logger;
2526
import org.slf4j.LoggerFactory;
@@ -1255,7 +1256,7 @@ public void run() {
12551256
try {
12561257
// Get nmPrivateDir
12571258
nmPrivateCTokensPath = dirsHandler.getLocalPathForWrite(
1258-
NM_PRIVATE_DIR + Path.SEPARATOR + tokenFileName);
1259+
NM_PRIVATE_DIR + Path.SEPARATOR + tokenFileName);
12591260

12601261
// 0) init queue, etc.
12611262
// 1) write credentials to private dir
@@ -1275,10 +1276,13 @@ public void run() {
12751276
throw new IOException("All disks failed. "
12761277
+ dirsHandler.getDisksHealthReport(false));
12771278
}
1278-
// TODO handle ExitCodeException separately?
1279-
} catch (FSError fe) {
1280-
exception = fe;
1281-
} catch (Exception e) {
1279+
// TODO handle ExitCodeException separately?
1280+
} catch (ConfigurationException e) {
1281+
exception = e;
1282+
LOG.error("Failed to launch localizer for {}, due to configuration error. " +
1283+
"Marking the node unhealthy.", localizerId, e);
1284+
nmContext.getNodeStatusUpdater().reportException(e);
1285+
} catch (Exception | FSError e) {
12821286
exception = e;
12831287
} finally {
12841288
if (exception != null) {

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java

Lines changed: 55 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -336,7 +336,7 @@ public void testStartLocalizer() throws IOException {
336336
assertThat(result.get(23)).isEqualTo("8040");
337337
assertThat(result.get(24)).isEqualTo("nmPrivateCTokensPath");
338338

339-
} catch (InterruptedException e) {
339+
} catch (ConfigurationException | InterruptedException e) {
340340
LOG.error("Error:"+e.getMessage(),e);
341341
Assert.fail();
342342
}
@@ -643,6 +643,60 @@ protected PrivilegedOperationExecutor getPrivilegedOperationExecutor() {
643643
e.getMessage().contains("exitCode"));
644644
}
645645

646+
final int[] exitCodesToThrow = {
647+
LinuxContainerExecutor.ExitCode.INVALID_CONTAINER_EXEC_PERMISSIONS.getExitCode(),
648+
LinuxContainerExecutor.ExitCode.INVALID_CONFIG_FILE.getExitCode(),
649+
};
650+
651+
for (int i = 0; i < exitCodesToThrow.length; i++) {
652+
int exitCode = exitCodesToThrow[i];
653+
doThrow(new PrivilegedOperationException("invalid config", exitCode, null, null))
654+
.when(spyPrivilegedExecutor).executePrivilegedOperation(
655+
any(), any(PrivilegedOperation.class),
656+
any(), any(), anyBoolean(), anyBoolean());
657+
658+
try {
659+
lce.startLocalizer(new LocalizerStartContext.Builder()
660+
.setNmPrivateContainerTokens(nmPrivateCTokensPath)
661+
.setNmAddr(address)
662+
.setUser(appSubmitter)
663+
.setAppId(appId.toString())
664+
.setLocId("12345")
665+
.setDirsHandler(dirService)
666+
.build());
667+
Assert.fail("startLocalizer should have thrown a ConfigurationException");
668+
} catch (ConfigurationException e) {
669+
assertTrue("Unexpected exception " + e,
670+
e.getMessage().contains("exitCode=" + exitCode));
671+
}
672+
}
673+
674+
doThrow(new PrivilegedOperationException("IO error", new IOException("No such file or directory")))
675+
.when(spyPrivilegedExecutor).executePrivilegedOperation(
676+
any(), any(PrivilegedOperation.class),
677+
any(), any(), anyBoolean(), anyBoolean());
678+
679+
try {
680+
lce.startLocalizer(new LocalizerStartContext.Builder()
681+
.setNmPrivateContainerTokens(nmPrivateCTokensPath)
682+
.setNmAddr(address)
683+
.setUser(appSubmitter)
684+
.setAppId(appId.toString())
685+
.setLocId("12345")
686+
.setDirsHandler(dirService)
687+
.build());
688+
Assert.fail("startLocalizer should have thrown a ConfigurationException");
689+
} catch (ConfigurationException e) {
690+
assertTrue("Unexpected exception " + e,
691+
e.getMessage().contains("Container executor not found"));
692+
}
693+
694+
695+
doThrow(new PrivilegedOperationException("interrupted"))
696+
.when(spyPrivilegedExecutor).executePrivilegedOperation(
697+
any(), any(PrivilegedOperation.class),
698+
any(), any(), anyBoolean(), anyBoolean());
699+
646700
lce.activateContainer(cid, new Path(workDir, "pid.txt"));
647701
lce.launchContainer(new ContainerStartContext.Builder()
648702
.setContainer(container)

0 commit comments

Comments
 (0)