Skip to content

Commit f49512b

Browse files
[SPARK-24356] [CORE] Duplicate strings in File.path managed by FileSegmentManagedBuffer
This patch eliminates duplicate strings that come from the 'path' field of java.io.File objects created by FileSegmentManagedBuffer. That is, we want to avoid the situation when multiple File instances for thesame pathname "foo/bar" are created, each with a separate copy of the "foo/bar" String instance. In some scenarios such duplicate strings may waste a lot of memory (~ 10% of the heap). To avoid that, we intern the pathname with String.intern(), and before that we make sure that it's in a normalized form (contains no "//", "///" etc.) Otherwise, the code in java.io.File would normalize it later, creating a new "foo/bar" String copy. Unfortunately, the normalization code that java.io.File uses internally is in the package-private class java.io.FileSystem, so we cannot call it here directly. Added unit test
1 parent 900bc1f commit f49512b

File tree

2 files changed

+49
-1
lines changed

2 files changed

+49
-1
lines changed

common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolver.java

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
import java.util.concurrent.ExecutionException;
2525
import java.util.concurrent.Executor;
2626
import java.util.concurrent.Executors;
27+
import java.util.regex.Matcher;
28+
import java.util.regex.Pattern;
2729

2830
import com.fasterxml.jackson.annotation.JsonCreator;
2931
import com.fasterxml.jackson.annotation.JsonProperty;
@@ -59,13 +61,16 @@ public class ExternalShuffleBlockResolver {
5961
private static final Logger logger = LoggerFactory.getLogger(ExternalShuffleBlockResolver.class);
6062

6163
private static final ObjectMapper mapper = new ObjectMapper();
64+
6265
/**
6366
* This a common prefix to the key for each app registration we stick in leveldb, so they
6467
* are easy to find, since leveldb lets you search based on prefix.
6568
*/
6669
private static final String APP_KEY_PREFIX = "AppExecShuffleInfo";
6770
private static final StoreVersion CURRENT_VERSION = new StoreVersion(1, 0);
6871

72+
private static final Pattern MULTIPLE_SEPARATORS = Pattern.compile(File.separator + "{2,}");
73+
6974
// Map containing all registered executors' metadata.
7075
@VisibleForTesting
7176
final ConcurrentMap<AppExecId, ExecutorShuffleInfo> executors;
@@ -259,7 +264,8 @@ static File getFile(String[] localDirs, int subDirsPerLocalDir, String filename)
259264
int hash = JavaUtils.nonNegativeHash(filename);
260265
String localDir = localDirs[hash % localDirs.length];
261266
int subDirId = (hash / localDirs.length) % subDirsPerLocalDir;
262-
return new File(new File(localDir, String.format("%02x", subDirId)), filename);
267+
return new File(createNormalizedInternedPathname(
268+
localDir, String.format("%02x", subDirId), filename));
263269
}
264270

265271
void close() {
@@ -272,6 +278,28 @@ void close() {
272278
}
273279
}
274280

281+
/**
282+
* This method is needed to avoid the situation when multiple File instances for the
283+
* same pathname "foo/bar" are created, each with a separate copy of the "foo/bar" String.
284+
* According to measurements, in some scenarios such duplicate strings may waste a lot
285+
* of memory (~ 10% of the heap). To avoid that, we intern the pathname, and before that
286+
* we make sure that it's in a normalized form (contains no "//", "///" etc.) Otherwise,
287+
* the internal code in java.io.File would normalize it later, creating a new "foo/bar"
288+
* String copy. Unfortunately, we cannot just reuse the normalization code that java.io.File
289+
* uses, since it is in the package-private class java.io.FileSystem.
290+
*/
291+
@VisibleForTesting
292+
static String createNormalizedInternedPathname(String dir1, String dir2, String fname) {
293+
String pathname = dir1 + File.separator + dir2 + File.separator + fname;
294+
Matcher m = MULTIPLE_SEPARATORS.matcher(pathname);
295+
pathname = m.replaceAll("/");
296+
// A single trailing slash needs to be taken care of separately
297+
if (pathname.length() > 1 && pathname.endsWith("/")) {
298+
pathname = pathname.substring(0, pathname.length() - 1);
299+
}
300+
return pathname.intern();
301+
}
302+
275303
/** Simply encodes an executor's full ID, which is appId + execId. */
276304
public static class AppExecId {
277305
public final String appId;

common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolverSuite.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
package org.apache.spark.network.shuffle;
1919

20+
import java.io.File;
2021
import java.io.IOException;
2122
import java.io.InputStream;
2223
import java.io.InputStreamReader;
@@ -135,4 +136,23 @@ public void jsonSerializationOfExecutorRegistration() throws IOException {
135136
"\"subDirsPerLocalDir\": 7, \"shuffleManager\": " + "\"" + SORT_MANAGER + "\"}";
136137
assertEquals(shuffleInfo, mapper.readValue(legacyShuffleJson, ExecutorShuffleInfo.class));
137138
}
139+
140+
@Test
141+
public void testNormalizeAndInternPathname() {
142+
assertPathsMatch("/foo", "bar", "baz", "/foo/bar/baz");
143+
assertPathsMatch("//foo/", "bar/", "//baz", "/foo/bar/baz");
144+
assertPathsMatch("foo", "bar", "baz///", "foo/bar/baz");
145+
assertPathsMatch("/foo/", "/bar//", "/baz", "/foo/bar/baz");
146+
assertPathsMatch("/", "", "", "/");
147+
assertPathsMatch("/", "/", "/", "/");
148+
}
149+
150+
private void assertPathsMatch(String p1, String p2, String p3, String expectedPathname) {
151+
String normPathname =
152+
ExternalShuffleBlockResolver.createNormalizedInternedPathname(p1, p2, p3);
153+
assertEquals(expectedPathname, normPathname);
154+
File file = new File(normPathname);
155+
String returnedPath = file.getPath();
156+
assertTrue(normPathname == returnedPath);
157+
}
138158
}

0 commit comments

Comments
 (0)