1818package org .apache .spark .network .shuffle ;
1919
2020import java .io .File ;
21- import java .util .regex .Matcher ;
22- import java .util .regex .Pattern ;
23-
24- import com .google .common .annotations .VisibleForTesting ;
2521
2622import org .apache .spark .network .util .JavaUtils ;
2723
2824public class ExecutorDiskUtils {
2925
30- private static final Pattern MULTIPLE_SEPARATORS = Pattern .compile (File .separator + "{2,}" );
31-
3226 /**
3327 * Hashes a filename into the corresponding local directory, in a manner consistent with
3428 * Spark's DiskBlockManager.getFile().
@@ -37,30 +31,14 @@ public static File getFile(String[] localDirs, int subDirsPerLocalDir, String fi
3731 int hash = JavaUtils .nonNegativeHash (filename );
3832 String localDir = localDirs [hash % localDirs .length ];
3933 int subDirId = (hash / localDirs .length ) % subDirsPerLocalDir ;
40- return new File (createNormalizedInternedPathname (
41- localDir , String .format ("%02x" , subDirId ), filename ));
42- }
43-
44- /**
45- * This method is needed to avoid the situation when multiple File instances for the
46- * same pathname "foo/bar" are created, each with a separate copy of the "foo/bar" String.
47- * According to measurements, in some scenarios such duplicate strings may waste a lot
48- * of memory (~ 10% of the heap). To avoid that, we intern the pathname, and before that
49- * we make sure that it's in a normalized form (contains no "//", "///" etc.) Otherwise,
50- * the internal code in java.io.File would normalize it later, creating a new "foo/bar"
51- * String copy. Unfortunately, we cannot just reuse the normalization code that java.io.File
52- * uses, since it is in the package-private class java.io.FileSystem.
53- */
54- @ VisibleForTesting
55- static String createNormalizedInternedPathname (String dir1 , String dir2 , String fname ) {
56- String pathname = dir1 + File .separator + dir2 + File .separator + fname ;
57- Matcher m = MULTIPLE_SEPARATORS .matcher (pathname );
58- pathname = m .replaceAll ("/" );
59- // A single trailing slash needs to be taken care of separately
60- if (pathname .length () > 1 && pathname .endsWith ("/" )) {
61- pathname = pathname .substring (0 , pathname .length () - 1 );
62- }
63- return pathname .intern ();
34+ final String notNormalizedPath =
35+ localDir + File .separator + String .format ("%02x" , subDirId ) + File .separator + filename ;
36+ // Interning the normalized path as according to measurements, in some scenarios such
37+ // duplicate strings may waste a lot of memory (~ 10% of the heap).
38+ // Unfortunately, we cannot just call the normalization code that java.io.File
39+ // uses, since it is in the package-private class java.io.FileSystem.
40+ final String normalizedPath = new File (notNormalizedPath ).getPath ();
41+ return new File (normalizedPath .intern ());
6442 }
6543
6644}
0 commit comments