Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -18,25 +18,11 @@
package org.apache.spark.network.shuffle;

import java.io.File;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.google.common.annotations.VisibleForTesting;

import org.apache.commons.lang3.SystemUtils;
import org.apache.spark.network.util.JavaUtils;

public class ExecutorDiskUtils {

private static final Pattern MULTIPLE_SEPARATORS;
static {
if (SystemUtils.IS_OS_WINDOWS) {
MULTIPLE_SEPARATORS = Pattern.compile("[/\\\\]+");
} else {
MULTIPLE_SEPARATORS = Pattern.compile("/{2,}");
}
}

/**
* Hashes a filename into the corresponding local directory, in a manner consistent with
* Spark's DiskBlockManager.getFile().
Expand All @@ -45,34 +31,16 @@ public static File getFile(String[] localDirs, int subDirsPerLocalDir, String fi
int hash = JavaUtils.nonNegativeHash(filename);
String localDir = localDirs[hash % localDirs.length];
int subDirId = (hash / localDirs.length) % subDirsPerLocalDir;
return new File(createNormalizedInternedPathname(
localDir, String.format("%02x", subDirId), filename));
}

/**
* This method is needed to avoid the situation when multiple File instances for the
* same pathname "foo/bar" are created, each with a separate copy of the "foo/bar" String.
* According to measurements, in some scenarios such duplicate strings may waste a lot
* of memory (~ 10% of the heap). To avoid that, we intern the pathname, and before that
* we make sure that it's in a normalized form (contains no "//", "///" etc.) Otherwise,
* the internal code in java.io.File would normalize it later, creating a new "foo/bar"
* String copy. Unfortunately, we cannot just reuse the normalization code that java.io.File
* uses, since it is in the package-private class java.io.FileSystem.
*
* On Windows, separator "\" is used instead of "/".
*
* "\\" is a legal character in path name on Unix-like OS, but illegal on Windows.
*/
@VisibleForTesting
static String createNormalizedInternedPathname(String dir1, String dir2, String fname) {
String pathname = dir1 + File.separator + dir2 + File.separator + fname;
Matcher m = MULTIPLE_SEPARATORS.matcher(pathname);
pathname = m.replaceAll(Matcher.quoteReplacement(File.separator));
// A single trailing slash needs to be taken care of separately
if (pathname.length() > 1 && pathname.charAt(pathname.length() - 1) == File.separatorChar) {
pathname = pathname.substring(0, pathname.length() - 1);
}
return pathname.intern();
final String notNormalizedPath =
localDir + File.separator + String.format("%02x", subDirId) + File.separator + filename;
// Interning the normalized path as according to measurements, in some scenarios such
// duplicate strings may waste a lot of memory (~ 10% of the heap).
// Unfortunately, we cannot just call the normalization code that java.io.File
// uses, since it is in the package-private class java.io.FileSystem.
// So we are creating a File just to get the normalized path back to intern it.
// Finally a new File is built and returned with this interned normalized path.
final String normalizedInternedPath = new File(notNormalizedPath).getPath().intern();
return new File(normalizedInternedPath);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executor;
import java.util.concurrent.Executors;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import org.apache.commons.lang3.builder.ToStringBuilder;
Expand Down Expand Up @@ -71,8 +70,6 @@ public class ExternalShuffleBlockResolver {
private static final String APP_KEY_PREFIX = "AppExecShuffleInfo";
private static final StoreVersion CURRENT_VERSION = new StoreVersion(1, 0);

private static final Pattern MULTIPLE_SEPARATORS = Pattern.compile(File.separator + "{2,}");

// Map containing all registered executors' metadata.
@VisibleForTesting
final ConcurrentMap<AppExecId, ExecutorShuffleInfo> executors;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,13 @@

package org.apache.spark.network.shuffle;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.io.CharStreams;
import org.apache.commons.lang3.SystemUtils;
import org.apache.spark.network.shuffle.protocol.ExecutorShuffleInfo;
import org.apache.spark.network.util.MapConfigProvider;
import org.apache.spark.network.util.TransportConf;
Expand Down Expand Up @@ -145,29 +143,4 @@ public void jsonSerializationOfExecutorRegistration() throws IOException {
assertEquals(shuffleInfo, mapper.readValue(legacyShuffleJson, ExecutorShuffleInfo.class));
}

@Test
public void testNormalizeAndInternPathname() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi, @attilapiros . Could you explain why we need to remove the existing test coverage in this Improvement PR?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@dongjoon-hyun sure, here you are:

The createNormalizedInternedPathname was as good as it was close to java.io.FileSystem#normalize() as in the old code the String interning could only save as any memory when its result was in equal with java.io.FileSystem#normalize() which was called within File constructor. If there was any difference in the string then the path in File would use a different (not interned) string as that string would be transformed a bit more.

When the test was created in the first place if they could call java.io.FileSystem#normalize() somehow within the tests only then they would given asserts where the expected result of createNormalizedInternedPathname would be java.io.FileSystem#normalize() (instead of the hardcoded string paths). The test should check whether the same transformation is done on the incoming path depending on the OS.

So now as we can call indirectly the java.io.FileSystem#normalize() we could rewrite the old test but that would mean having assert where java.io.FileSystem#normalize() is checked whether it is really java.io.FileSystem#normalize(). This would be a trivial assert as it is always true (like an assertTrue(true) just longer). So this is why we do not need the old tests.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So the test existed because Spark has been dealing with normalization by itself (createNormalizedInternedPathname), despite the fact we know File object will do the normalization. Given we get rid of the custom normalization, the test would become whether normalization in JDK File object works properly or not, say, testing JDK functionality, which feels redundant.

If we cannot rely on the JDK implementation then createNormalizedInternedPathname should be just rewritten to the optimized one and this test would then keep as it is, but I'm afraid it's good direction we don't trust JDK implementation.

Copy link
Contributor Author

@attilapiros attilapiros Jul 7, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but I'm afraid it's good direction we don't trust JDK implementation.

Let's assume JDK introduces a problem then the path is not totally normalized but still that string is interned when you use this PR so you saved the bytes. Your normalized path could be even better java.io.FileSystem#normalize() still the sole purpose of the createNormalizedInternedPathname method is to save heap.

Regarding memory saving you are as good as close to get to java.io.FileSystem#normalize().

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

@HeartSaVioR HeartSaVioR Jul 7, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My bad “if” was missed between “direction” and “we”. Sorry about that. I made clear I have some sort of belief for JDK implementation and the maintenance, otherwise I would suggest to just port the optimized code into here. That said, to avoid further miscommunication, I’m positive on removing test, and I said it even previous comment.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@dongjoon-hyun Are you OK with the answer? If you're OK with it I'll move this forward.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep~

String sep = File.separator;
String expectedPathname = sep + "foo" + sep + "bar" + sep + "baz";
assertPathsMatch("/foo", "bar", "baz", expectedPathname);
assertPathsMatch("//foo/", "bar/", "//baz", expectedPathname);
assertPathsMatch("/foo/", "/bar//", "/baz", expectedPathname);
assertPathsMatch("foo", "bar", "baz///", "foo" + sep + "bar" + sep + "baz");
assertPathsMatch("/", "", "", sep);
assertPathsMatch("/", "/", "/", sep);
if (SystemUtils.IS_OS_WINDOWS) {
assertPathsMatch("/foo\\/", "bar", "baz", expectedPathname);
} else {
assertPathsMatch("/foo\\/", "bar", "baz", sep + "foo\\" + sep + "bar" + sep + "baz");
}
}

private void assertPathsMatch(String p1, String p2, String p3, String expectedPathname) {
String normPathname =
ExecutorDiskUtils.createNormalizedInternedPathname(p1, p2, p3);
assertEquals(expectedPathname, normPathname);
File file = new File(normPathname);
String returnedPath = file.getPath();
assertEquals(normPathname, returnedPath);
}
}