Commit cc93e7e

YARN-2566. DefaultContainerExecutor should pick a working directory randomly. (Zhihai Xu via kasha)
1 parent: da709a2

6 files changed: +173 -16 lines


hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileContext.java

Lines changed: 1 addition & 1 deletion
@@ -161,7 +161,7 @@
 
 @InterfaceAudience.Public
 @InterfaceStability.Evolving /*Evolving for a release,to be changed to Stable */
-public final class FileContext {
+public class FileContext {
 
   public static final Log LOG = LogFactory.getLog(FileContext.class);
   /**
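Note that the only production change to FileContext here is dropping the final modifier. That is what lets the new unit test further down build a Mockito spy of FileContext, since the classic Mockito used by Hadoop at this point cannot spy or subclass final classes. A minimal illustration of that usage, not part of the commit, assuming Mockito is on the classpath:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileContext;

import static org.mockito.Mockito.spy;

public class FileContextSpyExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // This call only works because FileContext is no longer final;
    // Mockito creates the spy by generating a subclass of FileContext.
    FileContext spyLfs = spy(FileContext.getLocalFSFileContext(conf));
    System.out.println("Created spy: " + spyLfs.getClass().getName());
  }
}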

hadoop-yarn-project/CHANGES.txt

Lines changed: 3 additions & 0 deletions
@@ -643,6 +643,9 @@ Release 2.6.0 - UNRELEASED
     queue to which the apps were submitted is changed across RM restart.
     (Craig Welch & Chang Li via jianhe)
 
+    YARN-2566. DefaultContainerExecutor should pick a working directory randomly.
+    (Zhihai Xu via kasha)
+
 Release 2.5.1 - 2014-09-05
 
   INCOMPATIBLE CHANGES

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java

Lines changed: 2 additions & 2 deletions
@@ -840,10 +840,10 @@ public class YarnConfiguration extends Configuration {
   public static final String NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE =
     NM_DISK_HEALTH_CHECK_PREFIX + "max-disk-utilization-per-disk-percentage";
   /**
-   * By default, 100% of the disk can be used before it is marked as offline.
+   * By default, 90% of the disk can be used before it is marked as offline.
    */
   public static final float DEFAULT_NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE =
-      100.0F;
+      90.0F;
 
   /**
    * The minimum space that must be available on a local dir for it to be used.

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml

Lines changed: 1 addition & 1 deletion
@@ -948,7 +948,7 @@
     for full disk. This applies to yarn-nodemanager.local-dirs and
     yarn.nodemanager.log-dirs.</description>
     <name>yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage</name>
-    <value>100.0</value>
+    <value>90.0</value>
   </property>
 
   <property>
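The companion change in YarnConfiguration.java and yarn-default.xml tightens the disk health check: a local or log directory is now taken offline by default once it is 90% full rather than only when completely full. Clusters that prefer the old behavior can raise the threshold again, either in yarn-site.xml via the property name shown above or programmatically. A minimal sketch, not part of the commit; the 100.0f value here simply restores the previous default:

import org.apache.hadoop.yarn.conf.YarnConfiguration;

public class DiskUtilizationThresholdExample {
  public static void main(String[] args) {
    YarnConfiguration conf = new YarnConfiguration();
    // Restore the pre-change threshold: only mark a disk offline when full.
    conf.setFloat(
        YarnConfiguration.NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE, 100.0f);
    System.out.println("max-disk-utilization-per-disk-percentage = "
        + conf.getFloat(
            YarnConfiguration.NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE,
            YarnConfiguration.DEFAULT_NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE));
  }
}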

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DefaultContainerExecutor.java

Lines changed: 57 additions & 2 deletions
@@ -31,6 +31,7 @@
 import java.util.Arrays;
 import java.util.EnumSet;
 import java.util.List;
+import java.util.Random;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -103,8 +104,8 @@ public synchronized void startLocalizer(Path nmPrivateContainerTokensPath,
     createAppDirs(localDirs, user, appId);
     createAppLogDirs(appId, logDirs, user);
 
-    // TODO: Why pick first app dir. The same in LCE why not random?
-    Path appStorageDir = getFirstApplicationDir(localDirs, user, appId);
+    // randomly choose the local directory
+    Path appStorageDir = getWorkingDir(localDirs, user, appId);
 
     String tokenFn = String.format(ContainerLocalizer.TOKEN_FILE_NAME_FMT, locId);
     Path tokenDst = new Path(appStorageDir, tokenFn);
@@ -466,6 +467,10 @@ protected Path getFirstApplicationDir(List<String> localDirs, String user,
     return getApplicationDir(new Path(localDirs.get(0)), user, appId);
   }
 
+  private long getDiskFreeSpace(Path base) throws IOException {
+    return lfs.getFsStatus(base).getRemaining();
+  }
+
   private Path getApplicationDir(Path base, String user, String appId) {
     return new Path(getAppcacheDir(base, user), appId);
   }
@@ -484,6 +489,56 @@ private Path getFileCacheDir(Path base, String user) {
         ContainerLocalizer.FILECACHE);
   }
 
+  private Path getWorkingDir(List<String> localDirs, String user,
+      String appId) throws IOException {
+    Path appStorageDir = null;
+    long totalAvailable = 0L;
+    long[] availableOnDisk = new long[localDirs.size()];
+    int i = 0;
+    // randomly choose the app directory
+    // the chance of picking a directory is proportional to
+    // the available space on the directory.
+    // firstly calculate the sum of all available space on these directories
+    for (String localDir : localDirs) {
+      Path curBase = getApplicationDir(new Path(localDir),
+          user, appId);
+      long space = 0L;
+      try {
+        space = getDiskFreeSpace(curBase);
+      } catch (IOException e) {
+        LOG.warn("Unable to get Free Space for " + curBase.toString(), e);
+      }
+      availableOnDisk[i++] = space;
+      totalAvailable += space;
+    }
+
+    // throw an IOException if totalAvailable is 0.
+    if (totalAvailable <= 0L) {
+      throw new IOException("Not able to find a working directory for "
+          + user);
+    }
+
+    // make probability to pick a directory proportional to
+    // the available space on the directory.
+    Random r = new Random();
+    long randomPosition = Math.abs(r.nextLong()) % totalAvailable;
+    int dir = 0;
+    // skip zero available space directory,
+    // because totalAvailable is greater than 0 and randomPosition
+    // is less than totalAvailable, we can find a valid directory
+    // with nonzero available space.
+    while (availableOnDisk[dir] == 0L) {
+      dir++;
+    }
+    while (randomPosition > availableOnDisk[dir]) {
+      randomPosition -= availableOnDisk[dir++];
+    }
+    appStorageDir = getApplicationDir(new Path(localDirs.get(dir)),
+        user, appId);
+
+    return appStorageDir;
+  }
+
   protected void createDir(Path dirPath, FsPermission perms,
       boolean createParent, String user) throws IOException {
     lfs.mkdir(dirPath, perms, createParent);
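The heart of getWorkingDir() is a weighted random selection: the probability of choosing a local directory is proportional to its free space, and directories reporting zero free space are never chosen. The sketch below isolates that selection step, with hard-coded free-space values standing in for the lfs.getFsStatus() calls; it mirrors the loop structure of the commit rather than adding anything new.

import java.io.IOException;
import java.util.Random;

public class WeightedDirPickExample {

  // Pick an index with probability proportional to availableOnDisk[i],
  // using the same cumulative-walk approach as getWorkingDir() in
  // DefaultContainerExecutor.
  static int pickDir(long[] availableOnDisk) throws IOException {
    long totalAvailable = 0L;
    for (long space : availableOnDisk) {
      totalAvailable += space;
    }
    if (totalAvailable <= 0L) {
      throw new IOException("No local directory has free space");
    }
    Random r = new Random();
    long randomPosition = Math.abs(r.nextLong()) % totalAvailable;
    int dir = 0;
    // Skip directories that reported zero free space; because
    // randomPosition < totalAvailable, a nonzero entry must exist.
    while (availableOnDisk[dir] == 0L) {
      dir++;
    }
    // Walk the cumulative distribution until randomPosition falls
    // within the current directory's share.
    while (randomPosition > availableOnDisk[dir]) {
      randomPosition -= availableOnDisk[dir++];
    }
    return dir;
  }

  public static void main(String[] args) throws IOException {
    // Hypothetical free-space values (bytes) for three local dirs.
    long[] free = {0L, 4000L, 1000L};
    int[] picks = new int[free.length];
    for (int i = 0; i < 10000; i++) {
      picks[pickDir(free)]++;
    }
    // Expect roughly 0 / 8000 / 2000 picks.
    for (int i = 0; i < picks.length; i++) {
      System.out.println("dir " + i + ": picked " + picks[i] + " times");
    }
  }
}

Creating a new Random per call and taking Math.abs(r.nextLong()) % totalAvailable is exactly what the production code does; it only needs to spread localization load across disks, not produce a perfectly uniform draw.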

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDefaultContainerExecutor.java

Lines changed: 109 additions & 10 deletions
@@ -27,6 +27,7 @@
 import static org.mockito.Mockito.doAnswer;
 import static org.junit.Assert.assertTrue;
 
+import java.io.DataOutputStream;
 import java.io.File;
 import java.io.FileNotFoundException;
 import java.io.FileReader;
@@ -41,14 +42,6 @@
 import java.util.List;
 import java.util.Random;
 
-import org.apache.hadoop.fs.FileUtil;
-import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
-import org.apache.hadoop.yarn.api.records.ApplicationId;
-import org.apache.hadoop.yarn.api.records.ContainerId;
-import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
-import org.apache.hadoop.yarn.conf.YarnConfiguration;
-import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
-import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerDiagnosticsUpdateEvent;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.AbstractFileSystem;
@@ -57,20 +50,30 @@
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileContext;
 import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.FsStatus;
 import org.apache.hadoop.fs.Options.CreateOpts;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.permission.FsPermission;
 import org.apache.hadoop.io.DataInputBuffer;
 import org.apache.hadoop.io.DataOutputBuffer;
+import org.apache.hadoop.security.Credentials;
 import org.apache.hadoop.util.Progressable;
+import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
+import org.apache.hadoop.yarn.api.records.ApplicationId;
+import org.apache.hadoop.yarn.api.records.ContainerId;
+import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerDiagnosticsUpdateEvent;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.FakeFSDataInputStream;
 
+import org.junit.After;
 import org.junit.AfterClass;
+import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
-import org.junit.After;
-import org.junit.Assert;
 import org.mockito.invocation.InvocationOnMock;
 import org.mockito.stubbing.Answer;
@@ -296,6 +299,102 @@ public Object answer(InvocationOnMock invocationOnMock)
     }
   }
 
+  @Test(timeout = 30000)
+  public void testStartLocalizer()
+      throws IOException, InterruptedException {
+    InetSocketAddress localizationServerAddress;
+    final Path firstDir = new Path(BASE_TMP_PATH, "localDir1");
+    List<String> localDirs = new ArrayList<String>();
+    final Path secondDir = new Path(BASE_TMP_PATH, "localDir2");
+    List<String> logDirs = new ArrayList<String>();
+    final Path logDir = new Path(BASE_TMP_PATH, "logDir");
+    final Path tokenDir = new Path(BASE_TMP_PATH, "tokenDir");
+    FsPermission perms = new FsPermission((short)0770);
+
+    Configuration conf = new Configuration();
+    localizationServerAddress = conf.getSocketAddr(
+        YarnConfiguration.NM_BIND_HOST,
+        YarnConfiguration.NM_LOCALIZER_ADDRESS,
+        YarnConfiguration.DEFAULT_NM_LOCALIZER_ADDRESS,
+        YarnConfiguration.DEFAULT_NM_LOCALIZER_PORT);
+
+    final FileContext mockLfs = spy(FileContext.getLocalFSFileContext(conf));
+    final FileContext.Util mockUtil = spy(mockLfs.util());
+    doAnswer(new Answer() {
+      @Override
+      public Object answer(InvocationOnMock invocationOnMock)
+          throws Throwable {
+        return mockUtil;
+      }
+    }).when(mockLfs).util();
+    doAnswer(new Answer() {
+      @Override
+      public Object answer(InvocationOnMock invocationOnMock)
+          throws Throwable {
+        Path dest = (Path) invocationOnMock.getArguments()[1];
+        if (dest.toString().contains(firstDir.toString())) {
+          // throw an Exception when copy token to the first local dir
+          // to simulate no space on the first drive
+          throw new IOException("No space on this drive " +
+              dest.toString());
+        } else {
+          // copy token to the second local dir
+          DataOutputStream tokenOut = null;
+          try {
+            Credentials credentials = new Credentials();
+            tokenOut = mockLfs.create(dest,
+                EnumSet.of(CREATE, OVERWRITE));
+            credentials.writeTokenStorageToStream(tokenOut);
+          } finally {
+            if (tokenOut != null) {
+              tokenOut.close();
+            }
+          }
+        }
+        return null;
+      }
+    }).when(mockUtil).copy(any(Path.class), any(Path.class));
+    doAnswer(new Answer() {
+      @Override
+      public Object answer(InvocationOnMock invocationOnMock)
+          throws Throwable {
+        Path p = (Path) invocationOnMock.getArguments()[0];
+        // let second local directory return more free space than
+        // first local directory
+        if (p.toString().contains(firstDir.toString())) {
+          return new FsStatus(2000, 2000, 0);
+        } else {
+          return new FsStatus(1000, 0, 1000);
+        }
+      }
+    }).when(mockLfs).getFsStatus(any(Path.class));
+
+    DefaultContainerExecutor mockExec = spy(new DefaultContainerExecutor(
+        mockLfs));
+    mockExec.setConf(conf);
+    localDirs.add(mockLfs.makeQualified(firstDir).toString());
+    localDirs.add(mockLfs.makeQualified(secondDir).toString());
+    logDirs.add(mockLfs.makeQualified(logDir).toString());
+    conf.setStrings(YarnConfiguration.NM_LOCAL_DIRS,
+        localDirs.toArray(new String[localDirs.size()]));
+    conf.set(YarnConfiguration.NM_LOG_DIRS, logDir.toString());
+    mockLfs.mkdir(tokenDir, perms, true);
+    Path nmPrivateCTokensPath = new Path(tokenDir, "test.tokens");
+    String appSubmitter = "nobody";
+    String appId = "APP_ID";
+    String locId = "LOC_ID";
+    try {
+      mockExec.startLocalizer(nmPrivateCTokensPath, localizationServerAddress,
+          appSubmitter, appId, locId, localDirs, logDirs);
+    } catch (IOException e) {
+      Assert.fail("StartLocalizer failed to copy token file " + e);
+    } finally {
+      mockExec.deleteAsUser(appSubmitter, firstDir);
+      mockExec.deleteAsUser(appSubmitter, secondDir);
      mockExec.deleteAsUser(appSubmitter, logDir);
+      deleteTmpFiles();
+    }
+  }
 // @Test
 // public void testInit() throws IOException, InterruptedException {
 //   Configuration conf = new Configuration();
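In the mocked getFsStatus() answers above, the FsStatus constructor arguments read as (capacity, used, remaining), so new FsStatus(2000, 2000, 0) makes the first local directory look completely full while new FsStatus(1000, 0, 1000) leaves all of the second directory free. The mocked copy() also throws for any destination under the first directory, so the test only passes if startLocalizer() really does pick the directory that reports free space. A tiny standalone check of that reading of FsStatus (an illustration, not part of the commit):

import org.apache.hadoop.fs.FsStatus;

public class FsStatusExample {
  public static void main(String[] args) {
    // Same values as the test's mock: (capacity, used, remaining) in bytes.
    FsStatus firstDir = new FsStatus(2000, 2000, 0);   // reports no free space
    FsStatus secondDir = new FsStatus(1000, 0, 1000);  // reports 1000 bytes free
    // getWorkingDir() only consults getRemaining(), so firstDir carries
    // weight 0 in the random pick and secondDir carries weight 1000.
    System.out.println("firstDir remaining  = " + firstDir.getRemaining());
    System.out.println("secondDir remaining = " + secondDir.getRemaining());
  }
}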
