Skip to content

Commit de4feae

Browse files
misha-clouderasrowen
authored andcommitted
[SPARK-24356][CORE] Duplicate strings in File.path managed by FileSegmentManagedBuffer
This patch eliminates duplicate strings that come from the 'path' field of java.io.File objects created by FileSegmentManagedBuffer. That is, we want to avoid the situation when multiple File instances for the same pathname "foo/bar" are created, each with a separate copy of the "foo/bar" String instance. In some scenarios such duplicate strings may waste a lot of memory (~ 10% of the heap). To avoid that, we intern the pathname with String.intern(), and before that we make sure that it's in a normalized form (contains no "//", "///" etc.) Otherwise, the code in java.io.File would normalize it later, creating a new "foo/bar" String copy. Unfortunately, the normalization code that java.io.File uses internally is in the package-private class java.io.FileSystem, so we cannot call it here directly. ## What changes were proposed in this pull request? Added code to ExternalShuffleBlockResolver.getFile(), that normalizes and then interns the pathname string before passing it to the File() constructor. ## How was this patch tested? Added unit test Author: Misha Dmitriev <[email protected]> Closes #21456 from countmdm/misha/spark-24356.
1 parent a36c1a6 commit de4feae

File tree

2 files changed

+49
-1
lines changed

2 files changed

+49
-1
lines changed

common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolver.java

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
import java.util.concurrent.ExecutionException;
2525
import java.util.concurrent.Executor;
2626
import java.util.concurrent.Executors;
27+
import java.util.regex.Matcher;
28+
import java.util.regex.Pattern;
2729

2830
import com.fasterxml.jackson.annotation.JsonCreator;
2931
import com.fasterxml.jackson.annotation.JsonProperty;
@@ -59,13 +61,16 @@ public class ExternalShuffleBlockResolver {
5961
private static final Logger logger = LoggerFactory.getLogger(ExternalShuffleBlockResolver.class);
6062

6163
private static final ObjectMapper mapper = new ObjectMapper();
64+
6265
/**
6366
* This a common prefix to the key for each app registration we stick in leveldb, so they
6467
* are easy to find, since leveldb lets you search based on prefix.
6568
*/
6669
private static final String APP_KEY_PREFIX = "AppExecShuffleInfo";
6770
private static final StoreVersion CURRENT_VERSION = new StoreVersion(1, 0);
6871

72+
private static final Pattern MULTIPLE_SEPARATORS = Pattern.compile(File.separator + "{2,}");
73+
6974
// Map containing all registered executors' metadata.
7075
@VisibleForTesting
7176
final ConcurrentMap<AppExecId, ExecutorShuffleInfo> executors;
@@ -302,7 +307,8 @@ static File getFile(String[] localDirs, int subDirsPerLocalDir, String filename)
302307
int hash = JavaUtils.nonNegativeHash(filename);
303308
String localDir = localDirs[hash % localDirs.length];
304309
int subDirId = (hash / localDirs.length) % subDirsPerLocalDir;
305-
return new File(new File(localDir, String.format("%02x", subDirId)), filename);
310+
return new File(createNormalizedInternedPathname(
311+
localDir, String.format("%02x", subDirId), filename));
306312
}
307313

308314
void close() {
@@ -315,6 +321,28 @@ void close() {
315321
}
316322
}
317323

324+
/**
325+
* This method is needed to avoid the situation when multiple File instances for the
326+
* same pathname "foo/bar" are created, each with a separate copy of the "foo/bar" String.
327+
* According to measurements, in some scenarios such duplicate strings may waste a lot
328+
* of memory (~ 10% of the heap). To avoid that, we intern the pathname, and before that
329+
* we make sure that it's in a normalized form (contains no "//", "///" etc.) Otherwise,
330+
* the internal code in java.io.File would normalize it later, creating a new "foo/bar"
331+
* String copy. Unfortunately, we cannot just reuse the normalization code that java.io.File
332+
* uses, since it is in the package-private class java.io.FileSystem.
333+
*/
334+
@VisibleForTesting
335+
static String createNormalizedInternedPathname(String dir1, String dir2, String fname) {
336+
String pathname = dir1 + File.separator + dir2 + File.separator + fname;
337+
Matcher m = MULTIPLE_SEPARATORS.matcher(pathname);
338+
pathname = m.replaceAll("/");
339+
// A single trailing slash needs to be taken care of separately
340+
if (pathname.length() > 1 && pathname.endsWith("/")) {
341+
pathname = pathname.substring(0, pathname.length() - 1);
342+
}
343+
return pathname.intern();
344+
}
345+
318346
/** Simply encodes an executor's full ID, which is appId + execId. */
319347
public static class AppExecId {
320348
public final String appId;

common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolverSuite.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
package org.apache.spark.network.shuffle;
1919

20+
import java.io.File;
2021
import java.io.IOException;
2122
import java.io.InputStream;
2223
import java.io.InputStreamReader;
@@ -135,4 +136,23 @@ public void jsonSerializationOfExecutorRegistration() throws IOException {
135136
"\"subDirsPerLocalDir\": 7, \"shuffleManager\": " + "\"" + SORT_MANAGER + "\"}";
136137
assertEquals(shuffleInfo, mapper.readValue(legacyShuffleJson, ExecutorShuffleInfo.class));
137138
}
139+
140+
@Test
141+
public void testNormalizeAndInternPathname() {
142+
assertPathsMatch("/foo", "bar", "baz", "/foo/bar/baz");
143+
assertPathsMatch("//foo/", "bar/", "//baz", "/foo/bar/baz");
144+
assertPathsMatch("foo", "bar", "baz///", "foo/bar/baz");
145+
assertPathsMatch("/foo/", "/bar//", "/baz", "/foo/bar/baz");
146+
assertPathsMatch("/", "", "", "/");
147+
assertPathsMatch("/", "/", "/", "/");
148+
}
149+
150+
private void assertPathsMatch(String p1, String p2, String p3, String expectedPathname) {
151+
String normPathname =
152+
ExternalShuffleBlockResolver.createNormalizedInternedPathname(p1, p2, p3);
153+
assertEquals(expectedPathname, normPathname);
154+
File file = new File(normPathname);
155+
String returnedPath = file.getPath();
156+
assertTrue(normPathname == returnedPath);
157+
}
138158
}

0 commit comments

Comments
 (0)