apache · attilapiros · Jun 30, 2020 · dongjoon-hyun · Jul 7, 2020 · attilapiros
diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExecutorDiskUtils.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExecutorDiskUtils.java
@@ -18,25 +18,11 @@
 package org.apache.spark.network.shuffle;
 
 import java.io.File;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
-import com.google.common.annotations.VisibleForTesting;
-
-import org.apache.commons.lang3.SystemUtils;
 import org.apache.spark.network.util.JavaUtils;
 
 public class ExecutorDiskUtils {
 
-  private static final Pattern MULTIPLE_SEPARATORS;
-  static {
-    if (SystemUtils.IS_OS_WINDOWS) {
-      MULTIPLE_SEPARATORS = Pattern.compile("[/\\\\]+");
-    } else {
-      MULTIPLE_SEPARATORS = Pattern.compile("/{2,}");
-    }
-  }
-
   /**
    * Hashes a filename into the corresponding local directory, in a manner consistent with
    * Spark's DiskBlockManager.getFile().
@@ -45,34 +31,16 @@ public static File getFile(String[] localDirs, int subDirsPerLocalDir, String fi
     int hash = JavaUtils.nonNegativeHash(filename);
     String localDir = localDirs[hash % localDirs.length];
     int subDirId = (hash / localDirs.length) % subDirsPerLocalDir;
-    return new File(createNormalizedInternedPathname(
-        localDir, String.format("%02x", subDirId), filename));
-  }
-
-  /**
-   * This method is needed to avoid the situation when multiple File instances for the
-   * same pathname "foo/bar" are created, each with a separate copy of the "foo/bar" String.
-   * According to measurements, in some scenarios such duplicate strings may waste a lot
-   * of memory (~ 10% of the heap). To avoid that, we intern the pathname, and before that
-   * we make sure that it's in a normalized form (contains no "//", "///" etc.) Otherwise,
-   * the internal code in java.io.File would normalize it later, creating a new "foo/bar"
-   * String copy. Unfortunately, we cannot just reuse the normalization code that java.io.File
-   * uses, since it is in the package-private class java.io.FileSystem.
-   *
-   * On Windows, separator "\" is used instead of "/".
-   *
-   * "\\" is a legal character in path name on Unix-like OS, but illegal on Windows.
-   */
-  @VisibleForTesting
-  static String createNormalizedInternedPathname(String dir1, String dir2, String fname) {
-    String pathname = dir1 + File.separator + dir2 + File.separator + fname;
-    Matcher m = MULTIPLE_SEPARATORS.matcher(pathname);
-    pathname = m.replaceAll(Matcher.quoteReplacement(File.separator));
-    // A single trailing slash needs to be taken care of separately
-    if (pathname.length() > 1 && pathname.charAt(pathname.length() - 1) == File.separatorChar) {
-      pathname = pathname.substring(0, pathname.length() - 1);
-    }
-    return pathname.intern();
+    final String notNormalizedPath =
+      localDir + File.separator + String.format("%02x", subDirId) + File.separator + filename;
+    // Interning the normalized path as according to measurements, in some scenarios such
+    // duplicate strings may waste a lot of memory (~ 10% of the heap).
+    // Unfortunately, we cannot just call the normalization code that java.io.File
+    // uses, since it is in the package-private class java.io.FileSystem.
+    // So we are creating a File just to get the normalized path back to intern it.
+    // Finally a new File is built and returned with this interned normalized path.
+    final String normalizedInternedPath = new File(notNormalizedPath).getPath().intern();
+    return new File(normalizedInternedPath);
   }
 
 }
diff --git a/...-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolver.java b/...-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolver.java
@@ -24,7 +24,6 @@
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.Executor;
 import java.util.concurrent.Executors;
-import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 
 import org.apache.commons.lang3.builder.ToStringBuilder;
@@ -71,8 +70,6 @@ public class ExternalShuffleBlockResolver {
   private static final String APP_KEY_PREFIX = "AppExecShuffleInfo";
   private static final StoreVersion CURRENT_VERSION = new StoreVersion(1, 0);
 
-  private static final Pattern MULTIPLE_SEPARATORS = Pattern.compile(File.separator + "{2,}");
-
   // Map containing all registered executors' metadata.
   @VisibleForTesting
   final ConcurrentMap<AppExecId, ExecutorShuffleInfo> executors;

diff --git a/...fle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolverSuite.java b/...fle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolverSuite.java
@@ -17,15 +17,13 @@
 
 package org.apache.spark.network.shuffle;
 
-import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.nio.charset.StandardCharsets;
 
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.common.io.CharStreams;
-import org.apache.commons.lang3.SystemUtils;
 import org.apache.spark.network.shuffle.protocol.ExecutorShuffleInfo;
 import org.apache.spark.network.util.MapConfigProvider;
 import org.apache.spark.network.util.TransportConf;
@@ -145,29 +143,4 @@ public void jsonSerializationOfExecutorRegistration() throws IOException {
     assertEquals(shuffleInfo, mapper.readValue(legacyShuffleJson, ExecutorShuffleInfo.class));
   }
 
-  @Test
-  public void testNormalizeAndInternPathname() {
-    String sep = File.separator;
-    String expectedPathname = sep + "foo" + sep + "bar" + sep + "baz";
-    assertPathsMatch("/foo", "bar", "baz", expectedPathname);
-    assertPathsMatch("//foo/", "bar/", "//baz", expectedPathname);
-    assertPathsMatch("/foo/", "/bar//", "/baz", expectedPathname);
-    assertPathsMatch("foo", "bar", "baz///", "foo" + sep + "bar" + sep + "baz");
-    assertPathsMatch("/", "", "", sep);
-    assertPathsMatch("/", "/", "/", sep);
-    if (SystemUtils.IS_OS_WINDOWS) {
-      assertPathsMatch("/foo\\/", "bar", "baz", expectedPathname);
-    } else {
-      assertPathsMatch("/foo\\/", "bar", "baz", sep + "foo\\" + sep + "bar" + sep + "baz");
-    }
-  }
-
-  private void assertPathsMatch(String p1, String p2, String p3, String expectedPathname) {
-    String normPathname =
-      ExecutorDiskUtils.createNormalizedInternedPathname(p1, p2, p3);
-    assertEquals(expectedPathname, normPathname);
-    File file = new File(normPathname);
-    String returnedPath = file.getPath();
-    assertEquals(normPathname, returnedPath);
-  }
 }