diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
index c26ba60573f8d..06918921fe1d7 100644
--- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java
@@ -521,6 +521,16 @@ protected URI canonicalizeUri(URI rawUri) {
return S3xLoginHelper.canonicalizeUri(rawUri, getDefaultPort());
}
+ /**
+ * Make this protected method public so that {@link S3AGlobber} can access it.
+ * The call is passed straight through to the superclass implementation.
+ * @param p path
+ * @return the fixed path.
+ */
+ @Override
+ public Path fixRelativePart(Path p) {
+ return super.fixRelativePart(p);
+ }
+
/**
* Opens an FSDataInputStream at the indicated Path.
* @param f the file name to open
@@ -2114,23 +2124,42 @@ int getMaxKeys() {
}
/**
+ * Override superclass to use the new {@code S3AGlobber}.
+ * Delegates to {@link #globStatus(Path, PathFilter)} with the
+ * {@code ACCEPT_ALL} filter; that overload performs the statistic update.
* Increments the statistic {@link Statistic#INVOCATION_GLOB_STATUS}.
* {@inheritDoc}
*/
@Override
public FileStatus[] globStatus(Path pathPattern) throws IOException {
- incrementStatistic(INVOCATION_GLOB_STATUS);
- return super.globStatus(pathPattern);
+ return globStatus(pathPattern, ACCEPT_ALL);
}
/**
- * Override superclass so as to add statistic collection.
+ * Override superclass to use the new {@code S3AGlobber}.
+ * Increments the statistic {@link Statistic#INVOCATION_GLOB_STATUS}.
+ * A new {@code S3AGlobber} is created for every call. The result may be
+ * {@code null}: the globber returns null when a pattern with no wildcards
+ * and no groupings matches nothing.
* {@inheritDoc}
*/
@Override
public FileStatus[] globStatus(Path pathPattern, PathFilter filter)
throws IOException {
incrementStatistic(INVOCATION_GLOB_STATUS);
+ return new S3AGlobber(this, pathPattern, filter).glob();
+ }
+
+ /**
+ * Invoke the base {@link FileSystem#globStatus(Path, PathFilter)}
+ * rather than the {@code S3AGlobber} used by
+ * {@link #globStatus(Path, PathFilter)}.
+ * This is purely to allow tests to compare both the performance
+ * and results of the glob operation.
+ * Increments the statistic {@link Statistic#INVOCATION_GLOB_STATUS}.
+ *
+ * For Testing only.
+ * @param pathPattern the glob pattern to match paths against
+ * @param filter a user-supplied path filter
+ * @return an array of FileStatus objects
+ * @throws IOException if any I/O error occurs when fetching file status
+ */
+ public FileStatus[] globStatusClassic(Path pathPattern, PathFilter filter)
+ throws IOException {
+ incrementStatistic(INVOCATION_GLOB_STATUS);
return super.globStatus(pathPattern, filter);
}
diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AGlobber.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AGlobber.java
new file mode 100644
index 0000000000000..aebe0f4f1cd0f
--- /dev/null
+++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AGlobber.java
@@ -0,0 +1,291 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.fs.s3a;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FsTracer;
+import org.apache.hadoop.fs.GlobExpander;
+import org.apache.hadoop.fs.GlobFilter;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.htrace.core.TraceScope;
+import org.apache.htrace.core.Tracer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * A special variant of the {@code Globber} class, designed for S3A performance.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+class S3AGlobber {
+ // Logger for this class.
+ public static final Logger LOG = LoggerFactory.getLogger(S3AGlobber.class);
+
+ // Filesystem which is listed and probed while evaluating the glob.
+ private final S3AFileSystem fs;
+ // The glob pattern to evaluate, exactly as supplied by the caller.
+ private final Path pathPattern;
+ // User filter, applied to each fully matched path at the end of the glob.
+ private final PathFilter filter;
+ // Tracer obtained from the filesystem configuration; wraps each glob() call.
+ private final Tracer tracer;
+
+ /**
+  * Create a globber for one pattern.
+  * @param fs filesystem to glob against; its configuration supplies the tracer
+  * @param pathPattern glob pattern to expand and match
+  * @param filter user filter applied to every fully matched path
+  */
+ S3AGlobber(S3AFileSystem fs, Path pathPattern, PathFilter filter) {
+   this.tracer = FsTracer.get(fs.getConf());
+   this.filter = filter;
+   this.pathPattern = pathPattern;
+   this.fs = fs;
+ }
+
+ /**
+  * Look up the status of a path, mapping "not found" to null.
+  * @param path path to probe
+  * @return the status, or null if the filesystem raised
+  * {@link FileNotFoundException}
+  * @throws IOException any other failure of the probe
+  */
+ private FileStatus getFileStatus(Path path) throws IOException {
+   FileStatus found;
+   try {
+     found = fs.getFileStatus(path);
+   } catch (FileNotFoundException notFound) {
+     // a missing path is an expected outcome while globbing
+     found = null;
+   }
+   return found;
+ }
+
+ /**
+  * List a path, mapping "not found" to an empty listing.
+  * @param path path to list
+  * @return the listing, or an empty array if the filesystem raised
+  * {@link FileNotFoundException}
+  * @throws IOException any other failure of the listing
+  */
+ private FileStatus[] listStatus(Path path) throws IOException {
+   FileStatus[] listing;
+   try {
+     listing = fs.listStatus(path);
+   } catch (FileNotFoundException notFound) {
+     // nothing there: report no children rather than failing the glob
+     listing = new FileStatus[0];
+   }
+   return listing;
+ }
+
+ /**
+  * Hand the path to the filesystem's own {@code fixRelativePart}.
+  * @param path path to fix up
+  * @return whatever the filesystem returns for that path
+  */
+ private Path fixRelativePart(Path path) {
+   final Path fixed = fs.fixRelativePart(path);
+   return fixed;
+ }
+
+ /**
+  * Strip backslash escapes from a path component, turning it into the
+  * literal name it stands for. Needed to refer explicitly to a path whose
+  * name contains globber metacharacters.
+  * @param name path component, possibly containing backslash escapes
+  * @return the component with every backslash-plus-character pair replaced
+  * by the character alone
+  */
+ private static String unescapePathComponent(String name) {
+   // regex: one backslash followed by any single (captured) character
+   final String escapedChar = "\\\\(.)";
+   final String capturedChar = "$1";
+   return name.replaceAll(escapedChar, capturedChar);
+ }
+
+ /**
+  * Translate an absolute path into a list of path components.
+  * We merge double slashes into a single slash here: empty components
+  * are dropped.
+  * POSIX root path, i.e. '/', does not get an entry in the list.
+  * @param path absolute path string
+  * @return the non-empty components, in order; a mutable list
+  * @throws IOException declared for callers; not raised by this code
+  */
+ private static List<String> getPathComponents(String path)
+     throws IOException {
+   // Typed as List<String>: callers read and remove String elements.
+   List<String> components = new ArrayList<>();
+   for (String component : path.split(Path.SEPARATOR)) {
+     if (!component.isEmpty()) {
+       components.add(component);
+     }
+   }
+   return components;
+ }
+
+ /**
+  * Work out the URI scheme to use for a path.
+  * @param path path to examine
+  * @return the path's own scheme, or the filesystem's when the path has none
+  * @throws IOException declared for callers; not raised by this code
+  */
+ private String schemeFromPath(Path path) throws IOException {
+   final String declared = path.toUri().getScheme();
+   return declared != null ? declared : fs.getUri().getScheme();
+ }
+
+ /**
+  * Work out the URI authority to use for a path.
+  * @param path path to examine
+  * @return the path's own authority, or the filesystem's when the path
+  * has none
+  * @throws IOException declared for callers; not raised by this code
+  */
+ private String authorityFromPath(Path path) throws IOException {
+   String authority = path.toUri().getAuthority();
+   if (authority == null) {
+     // fs cannot be null here: the constructor already dereferences it,
+     // so no null guard is needed (matches schemeFromPath).
+     authority = fs.getUri().getAuthority();
+   }
+   return authority;
+ }
+
+ /**
+  * Evaluate the glob inside a trace scope named {@code S3AGlobber#glob},
+  * annotated with the path part of the pattern.
+  * @return the matches; may be null (see {@code doGlob})
+  * @throws IOException on a failure while listing or probing the filesystem
+  */
+ public FileStatus[] glob() throws IOException {
+   try (TraceScope traceScope = tracer.newScope("S3AGlobber#glob")) {
+     final String patternPath = pathPattern.toUri().getPath();
+     traceScope.addKVAnnotation("pattern", patternPath);
+     return doGlob();
+   }
+ }
+
+ /**
+  * Expand the groupings in the pattern, then match each flattened pattern
+  * against the filesystem one path component at a time.
+  * @return the sorted matches accepted by the user filter; {@code null} when
+  * nothing matched and the pattern had neither wildcards nor groupings
+  * @throws IOException on a failure while listing or probing the filesystem
+  */
+ private FileStatus[] doGlob() throws IOException {
+   // First we get the scheme and authority of the pattern that was passed
+   // in.
+   String scheme = schemeFromPath(pathPattern);
+   String authority = authorityFromPath(pathPattern);
+
+   // Next we strip off everything except the pathname itself, and expand all
+   // globs. Expansion is a process which turns "grouping" clauses,
+   // expressed as brackets, into separate path patterns.
+   String pathPatternString = pathPattern.toUri().getPath();
+   List<String> flattenedPatterns = GlobExpander.expand(pathPatternString);
+
+   // Now loop over all flattened patterns. In every case, we'll be trying to
+   // match them to entries in the filesystem.
+   ArrayList<FileStatus> results =
+       new ArrayList<>(flattenedPatterns.size());
+
+   boolean sawWildcard = false;
+   for (String flatPattern : flattenedPatterns) {
+     // Get the absolute path for this flattened pattern. We couldn't do
+     // this prior to flattening because of patterns like {/,a}, where which
+     // path you go down influences how the path must be made absolute.
+     Path absPattern = fixRelativePart(new Path(
+         flatPattern.isEmpty() ? Path.CUR_DIR : flatPattern));
+     // Now we break the flattened, absolute pattern into path components.
+     // For example, /a/*/c would be broken into the list [a, *, c]
+     List<String> components =
+         getPathComponents(absPattern.toUri().getPath());
+     // Starting out at the root of the filesystem, we try to match
+     // filesystem entries against pattern components.
+     ArrayList<FileStatus> candidates = new ArrayList<>(1);
+     // Getting the "real" FileStatus of root needs a call to the
+     // filesystem. So we create a placeholder FileStatus which has the
+     // correct path, but defaults for the rest of the information.
+     // Later, if it turns out we actually want the FileStatus of root, we'll
+     // replace the placeholder with a real FileStatus obtained from the
+     // filesystem.
+     FileStatus rootPlaceholder;
+     if (Path.WINDOWS && !components.isEmpty()
+         && Path.isWindowsAbsolutePath(absPattern.toUri().getPath(), true)) {
+       // On Windows the path could begin with a drive letter, e.g. /E:/foo.
+       // We will skip matching the drive letter and start from listing the
+       // root of the filesystem on that drive.
+       String driveLetter = components.remove(0);
+       rootPlaceholder = new FileStatus(0, true, 0, 0, 0, new Path(scheme,
+           authority, Path.SEPARATOR + driveLetter + Path.SEPARATOR));
+     } else {
+       rootPlaceholder = new FileStatus(0, true, 0, 0, 0,
+           new Path(scheme, authority, Path.SEPARATOR));
+     }
+     candidates.add(rootPlaceholder);
+
+     for (int componentIdx = 0; componentIdx < components.size();
+         componentIdx++) {
+       ArrayList<FileStatus> newCandidates =
+           new ArrayList<>(candidates.size());
+       GlobFilter globFilter = new GlobFilter(components.get(componentIdx));
+       String component = unescapePathComponent(components.get(componentIdx));
+       if (globFilter.hasPattern()) {
+         sawWildcard = true;
+       }
+       if (candidates.isEmpty() && sawWildcard) {
+         // Optimization: if there are no more candidates left, stop examining
+         // the path components. We can only do this if we've already seen
+         // a wildcard component-- otherwise, we still need to visit all path
+         // components in case one of them is a wildcard.
+         break;
+       }
+       if ((componentIdx < components.size() - 1) &&
+           (!globFilter.hasPattern())) {
+         // Optimization: if this is not the terminal path component, and we
+         // are not matching against a glob, assume that it exists. If it
+         // doesn't exist, we'll find out later when resolving a later glob
+         // or the terminal path component.
+         for (FileStatus candidate : candidates) {
+           candidate.setPath(new Path(candidate.getPath(), component));
+         }
+         continue;
+       }
+       for (FileStatus candidate : candidates) {
+         if (globFilter.hasPattern()) {
+           FileStatus[] children = listStatus(candidate.getPath());
+           if (children.length == 1) {
+             // If we get back only one result, this could be either a listing
+             // of a directory with one entry, or it could reflect the fact
+             // that what we listed resolved to a file.
+             //
+             // Unfortunately, we can't just compare the returned paths to
+             // figure this out. Consider the case where you have /a/b, where
+             // b is a symlink to "..". In that case, listing /a/b will give
+             // back "/a/b" again. If we just went by returned pathname, we'd
+             // incorrectly conclude that /a/b was a file and should not match
+             // /a/*/*. So we use getFileStatus of the path we just listed to
+             // disambiguate.
+             FileStatus listed = getFileStatus(candidate.getPath());
+             if (listed == null) {
+               // getFileStatus() maps "not found" to null: the entry went
+               // away between the listing and this probe. Skip it rather
+               // than fail with a NullPointerException.
+               LOG.warn("File/directory {} not found:"
+                   + " it may have been deleted", candidate.getPath());
+               continue;
+             }
+             if (!listed.isDirectory()) {
+               continue;
+             }
+           }
+           for (FileStatus child : children) {
+             if (componentIdx < components.size() - 1) {
+               // Don't try to recurse into non-directories. See HADOOP-10957.
+               if (!child.isDirectory()) {
+                 continue;
+               }
+             }
+             // Set the child path based on the parent path.
+             child.setPath(new Path(candidate.getPath(),
+                 child.getPath().getName()));
+             if (globFilter.accept(child.getPath())) {
+               newCandidates.add(child);
+             }
+           }
+         } else {
+           // When dealing with non-glob components, use getFileStatus
+           // instead of listStatus: a single probe of the named child
+           // rather than a listing of the whole parent.
+           FileStatus childStatus = getFileStatus(
+               new Path(candidate.getPath(), component));
+           if (childStatus != null) {
+             newCandidates.add(childStatus);
+           }
+         }
+       }
+       candidates = newCandidates;
+     }
+     for (FileStatus status : candidates) {
+       // Use object equality to see if this status is the root placeholder.
+       // See the explanation for rootPlaceholder above for more information.
+       if (status == rootPlaceholder) {
+         status = getFileStatus(rootPlaceholder.getPath());
+         if (status == null) {
+           continue;
+         }
+       }
+       // HADOOP-3497 semantics: the user-defined filter is applied at the
+       // end, once the full path is built up.
+       if (filter.accept(status.getPath())) {
+         results.add(status);
+       }
+     }
+   }
+   /*
+    * When the input pattern "looks" like just a simple filename, and we
+    * can't find it, we return null rather than an empty array.
+    * This is a special case which the shell relies on.
+    *
+    * To be more precise: if there were no results, AND there were no
+    * groupings (aka brackets), and no wildcards in the input (aka stars),
+    * we return null.
+    */
+   if ((!sawWildcard) && results.isEmpty() &&
+       (flattenedPatterns.size() <= 1)) {
+     return null;
+   }
+   /*
+    * In general, the results list will already be sorted, since listStatus
+    * returns results in sorted order for many Hadoop filesystems. However,
+    * not all Hadoop filesystems have this property. So we sort here in order
+    * to get consistent results. See HADOOP-10798 for details.
+    */
+   FileStatus[] ret = results.toArray(new FileStatus[results.size()]);
+   Arrays.sort(ret);
+   return ret;
+ }
+}
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AGlobPaths.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AGlobPaths.java
new file mode 100644
index 0000000000000..f842500e97ddd
--- /dev/null
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/TestS3AGlobPaths.java
@@ -0,0 +1,944 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.fs.s3a;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.Ordering;
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSTestWrapper;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileSystemTestWrapper;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.fs.TestPath;
+import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.test.GenericTestUtils;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.UUID;
+import java.util.regex.Pattern;
+
+/**
+ * Globbing tests. These tests include patterns relative to the user
+ * home directory, so they cannot be run in parallel with any other
+ * tests which also use that directory.
+ */
+public class TestS3AGlobPaths extends AbstractS3ATestBase {
+
+ /**
+  * Path filter which accepts a path when its full string form matches
+  * a regular expression.
+  */
+ static class RegexPathFilter implements PathFilter {
+
+   // Compiled once, so accept() does not recompile the regex on every call.
+   private final Pattern pattern;
+
+   /**
+    * @param regex expression the whole path string must match
+    */
+   public RegexPathFilter(String regex) {
+     this.pattern = Pattern.compile(regex);
+   }
+
+   @Override
+   public boolean accept(Path path) {
+     // matches() requires the entire string to match, as String.matches does
+     return pattern.matcher(path.toString()).matches();
+   }
+
+ }
+
+ // Filesystem under test; assigned in setup().
+ private S3AFileSystem fs;
+ // Upper bound on the number of entries buildPaths() creates and records.
+ private static final int NUM_OF_PATHS = 4;
+ // Home directory of the filesystem; deleted recursively in cleanupFS().
+ private Path userPath;
+ // Path component (no scheme or authority) of userPath.
+ private String userDir;
+ // userDir quoted with Pattern.quote() for literal use inside a regex.
+ private String userDirQuoted;
+
+ // Qualified paths created by the most recent buildPaths() call, by index.
+ private final Path[] path = new Path[NUM_OF_PATHS];
+
+ /**
+  * Run the superclass setup, then cache the filesystem, its home
+  * directory and the string forms of that directory used by the tests.
+  */
+ @Override
+ public void setup() throws Exception {
+   super.setup();
+   fs = getFileSystem();
+   userPath = fs.getHomeDirectory();
+   final String homePath = userPath.toUri().getPath();
+   userDir = homePath;
+   userDirQuoted = Pattern.quote(homePath);
+ }
+
+ /**
+  * Recursively delete the user home directory after each test.
+  * NOTE(review): tests which build absolute paths outside the home
+  * directory (e.g. "/user/aa/a.c", "/a/b") are not cleaned up here.
+  */
+ @After
+ public void cleanupFS() throws IOException {
+   if (fs == null) {
+     // setup never got as far as binding to a filesystem
+     return;
+   }
+   fs.delete(userPath, true);
+ }
+
+ // Builds a small tree under the user home directory, then globs it with
+ // absolute and relative patterns, with and without a user filter.
+ // checkStatus() compares results in order, so the expected paths are
+ // listed in the order the glob returns them.
+ @Test
+ public void testMultiGlob() throws IOException {
+ FileStatus[] status;
+ /*
+ * Layout created below, relative to the user home directory:
+ * /dir1/subdir1
+ * /dir1/subdir1/f1
+ * /dir1/subdir1/f2
+ * /dir1/subdir2/f1
+ * /dir2/subdir1
+ * /dir2/subdir2
+ * /dir2/subdir2/f1
+ * /dir3/f1
+ * /dir3/f2(dir)
+ * /dir3/subdir2(file)
+ * /dir3/subdir3
+ * /dir3/subdir3/f1
+ * /dir3/subdir3/f1/f1
+ * /dir3/subdir3/f3
+ * /dir4
+ */
+
+ Path d1 = new Path(userDir, "dir1");
+ Path d11 = new Path(d1, "subdir1");
+ Path d12 = new Path(d1, "subdir2");
+
+ Path f111 = new Path(d11, "f1");
+ fs.createNewFile(f111);
+ Path f112 = new Path(d11, "f2");
+ fs.createNewFile(f112);
+ Path f121 = new Path(d12, "f1");
+ fs.createNewFile(f121);
+
+ Path d2 = new Path(userDir, "dir2");
+ Path d21 = new Path(d2, "subdir1");
+ fs.mkdirs(d21);
+ Path d22 = new Path(d2, "subdir2");
+ Path f221 = new Path(d22, "f1");
+ fs.createNewFile(f221);
+
+ Path d3 = new Path(userDir, "dir3");
+ Path f31 = new Path(d3, "f1");
+ fs.createNewFile(f31);
+ Path d32 = new Path(d3, "f2");
+ fs.mkdirs(d32);
+ Path f32 = new Path(d3, "subdir2"); // fake as a subdir!
+ fs.createNewFile(f32);
+ Path d33 = new Path(d3, "subdir3");
+ Path f333 = new Path(d33, "f3");
+ fs.createNewFile(f333);
+ Path d331 = new Path(d33, "f1");
+ Path f3311 = new Path(d331, "f1");
+ fs.createNewFile(f3311);
+ Path d4 = new Path(userDir, "dir4");
+ fs.mkdirs(d4);
+
+ /*
+ * basic
+ */
+ Path root = userPath;
+ checkStatus(fs.globStatus(root), root);
+
+ // a missing path with no wildcard or grouping yields null, not an empty array
+ status = fs.globStatus(new Path(userDir, "x"));
+ assertNull(status);
+
+ status = fs.globStatus(new Path("x"));
+ assertNull(status);
+
+ status = fs.globStatus(new Path(userDir, "x/x"));
+ assertNull(status);
+
+ status = fs.globStatus(new Path("x/x"));
+ assertNull(status);
+
+ status = fs.globStatus(new Path(userDir, "*"));
+ checkStatus(status, d1, d2, d3, d4);
+
+ status = fs.globStatus(new Path("*"));
+ checkStatus(status, d1, d2, d3, d4);
+
+ // once a wildcard is present, no match yields an empty array
+ status = fs.globStatus(new Path(userDir, "*/x"));
+ checkStatus(status);
+
+ status = fs.globStatus(new Path("*/x"));
+ checkStatus(status);
+
+ status = fs.globStatus(new Path(userDir, "x/*"));
+ checkStatus(status);
+
+ status = fs.globStatus(new Path("x/*"));
+ checkStatus(status);
+
+ // make sure full pattern is scanned instead of bailing early with undef
+ status = fs.globStatus(new Path(userDir, "x/x/x/*"));
+ checkStatus(status);
+
+ status = fs.globStatus(new Path("x/x/x/*"));
+ checkStatus(status);
+
+ status = fs.globStatus(new Path(userDir, "*/*"));
+ checkStatus(status, d11, d12, d21, d22, f31, d32, f32, d33);
+
+ status = fs.globStatus(new Path("*/*"));
+ checkStatus(status, d11, d12, d21, d22, f31, d32, f32, d33);
+
+ /*
+ * one level deep
+ */
+ status = fs.globStatus(new Path(userDir, "dir*/*"));
+ checkStatus(status, d11, d12, d21, d22, f31, d32, f32, d33);
+
+ status = fs.globStatus(new Path("dir*/*"));
+ checkStatus(status, d11, d12, d21, d22, f31, d32, f32, d33);
+
+ status = fs.globStatus(new Path(userDir, "dir*/subdir*"));
+ checkStatus(status, d11, d12, d21, d22, f32, d33);
+
+ status = fs.globStatus(new Path("dir*/subdir*"));
+ checkStatus(status, d11, d12, d21, d22, f32, d33);
+
+ status = fs.globStatus(new Path(userDir, "dir*/f*"));
+ checkStatus(status, f31, d32);
+
+ status = fs.globStatus(new Path("dir*/f*"));
+ checkStatus(status, f31, d32);
+
+ /*
+ * subdir1 globs
+ */
+ status = fs.globStatus(new Path(userPath, "dir*/subdir1"));
+ checkStatus(status, d11, d21);
+
+ status = fs.globStatus(new Path(userPath, "dir*/subdir1/*"));
+ checkStatus(status, f111, f112);
+
+ status = fs.globStatus(new Path(userPath, "dir*/subdir1/*/*"));
+ checkStatus(status);
+
+ status = fs.globStatus(new Path(userPath, "dir*/subdir1/x"));
+ checkStatus(status);
+
+ status = fs.globStatus(new Path(userPath, "dir*/subdir1/x*"));
+ checkStatus(status);
+
+ /*
+ * subdir2 globs
+ */
+ status = fs.globStatus(new Path(userPath, "dir*/subdir2"));
+ checkStatus(status, d12, d22, f32);
+
+ status = fs.globStatus(new Path(userPath, "dir*/subdir2/*"));
+ checkStatus(status, f121, f221);
+
+ status = fs.globStatus(new Path(userPath, "dir*/subdir2/*/*"));
+ checkStatus(status);
+
+ /*
+ * subdir3 globs
+ */
+ status = fs.globStatus(new Path(userPath, "dir*/subdir3"));
+ checkStatus(status, d33);
+
+ status = fs.globStatus(new Path(userPath, "dir*/subdir3/*"));
+ checkStatus(status, d331, f333);
+
+ status = fs.globStatus(new Path(userPath, "dir*/subdir3/*/*"));
+ checkStatus(status, f3311);
+
+ status = fs.globStatus(new Path(userPath, "dir*/subdir3/*/*/*"));
+ checkStatus(status);
+
+ /*
+ * file1 single dir globs
+ */
+ status = fs.globStatus(new Path(userPath, "dir*/subdir1/f1"));
+ checkStatus(status, f111);
+
+ status = fs.globStatus(new Path(userPath, "dir*/subdir1/f1*"));
+ checkStatus(status, f111);
+
+ status = fs.globStatus(new Path(userPath, "dir*/subdir1/f1/*"));
+ checkStatus(status);
+
+ status = fs.globStatus(new Path(userPath, "dir*/subdir1/f1*/*"));
+ checkStatus(status);
+
+ /*
+ * file1 multi-dir globs
+ */
+ status = fs.globStatus(new Path(userPath, "dir*/subdir*/f1"));
+ checkStatus(status, f111, f121, f221, d331);
+
+ status = fs.globStatus(new Path(userPath, "dir*/subdir*/f1*"));
+ checkStatus(status, f111, f121, f221, d331);
+
+ status = fs.globStatus(new Path(userPath, "dir*/subdir*/f1/*"));
+ checkStatus(status, f3311);
+
+ status = fs.globStatus(new Path(userPath, "dir*/subdir*/f1*/*"));
+ checkStatus(status, f3311);
+
+ status = fs.globStatus(new Path(userPath, "dir*/subdir*/f1*/*"));
+ checkStatus(status, f3311);
+
+ status = fs.globStatus(new Path(userPath, "dir*/subdir*/f1*/x"));
+ checkStatus(status);
+
+ status = fs.globStatus(new Path(userPath, "dir*/subdir*/f1*/*/*"));
+ checkStatus(status);
+
+ /*
+ * file glob multiple files
+ */
+
+ status = fs.globStatus(new Path(userPath, "dir*/subdir*"));
+ checkStatus(status, d11, d12, d21, d22, f32, d33);
+
+ status = fs.globStatus(new Path(userPath, "dir*/subdir*/*"));
+ checkStatus(status, f111, f112, f121, f221, d331, f333);
+
+ status = fs.globStatus(new Path(userPath, "dir*/subdir*/f*"));
+ checkStatus(status, f111, f112, f121, f221, d331, f333);
+
+ status = fs.globStatus(new Path(userPath, "dir*/subdir*/f*/*"));
+ checkStatus(status, f3311);
+
+ status = fs.globStatus(new Path(userPath, "dir*/subdir*/*/f1"));
+ checkStatus(status, f3311);
+
+ status = fs.globStatus(new Path(userPath, "dir*/subdir*/*/*"));
+ checkStatus(status, f3311);
+
+ // doesn't exist
+ status = fs.globStatus(new Path(userPath, "dir*/subdir1/f3"));
+ checkStatus(status);
+
+ status = fs.globStatus(new Path(userPath, "dir*/subdir1/f3*"));
+ checkStatus(status);
+
+ // curly-bracket groupings; these calls expect an array even when
+ // nothing matches
+ status = fs.globStatus(new Path("{x}"));
+ checkStatus(status);
+
+ status = fs.globStatus(new Path("{x,y}"));
+ checkStatus(status);
+
+ status = fs.globStatus(new Path("dir*/{x,y}"));
+ checkStatus(status);
+
+ status = fs.globStatus(new Path("dir*/{f1,y}"));
+ checkStatus(status, f31);
+
+ status = fs.globStatus(new Path("{x,y}"));
+ checkStatus(status);
+
+ status = fs.globStatus(new Path("/{x/x,y/y}"));
+ checkStatus(status);
+
+ status = fs.globStatus(new Path("{x/x,y/y}"));
+ checkStatus(status);
+
+ status = fs.globStatus(new Path(Path.CUR_DIR));
+ checkStatus(status, userPath);
+
+ status = fs.globStatus(new Path(userDir + "{/dir1}"));
+ checkStatus(status, d1);
+
+ status = fs.globStatus(new Path(userDir + "{/dir*}"));
+ checkStatus(status, d1, d2, d3, d4);
+
+ // NOTE(review): trueFilter is not declared in this part of the file;
+ // presumably an accept-all PathFilter defined elsewhere in the class.
+ status = fs.globStatus(new Path(Path.SEPARATOR), trueFilter);
+ checkStatus(status, new Path(Path.SEPARATOR));
+
+ status = fs.globStatus(new Path(Path.CUR_DIR), trueFilter);
+ checkStatus(status, userPath);
+
+ status = fs.globStatus(d1, trueFilter);
+ checkStatus(status, d1);
+
+ status = fs.globStatus(userPath, trueFilter);
+ checkStatus(status, userPath);
+
+ status = fs.globStatus(new Path(userPath, "*"), trueFilter);
+ checkStatus(status, d1, d2, d3, d4);
+
+ status = fs.globStatus(new Path("/x/*"), trueFilter);
+ checkStatus(status);
+
+ status = fs.globStatus(new Path("/x"), trueFilter);
+ assertNull(status);
+
+ status = fs.globStatus(new Path("/x/x"), trueFilter);
+ assertNull(status);
+
+ /*
+ * false filter
+ */
+ PathFilter falseFilter = new PathFilter() {
+ @Override
+ public boolean accept(Path p) {
+ return false;
+ }
+ };
+
+ // with everything filtered out, wildcard-free patterns are expected to
+ // yield null and wildcard patterns an empty array
+ status = fs.globStatus(new Path(Path.SEPARATOR), falseFilter);
+ assertNull(status);
+
+ status = fs.globStatus(new Path(Path.CUR_DIR), falseFilter);
+ assertNull(status);
+
+ status = fs.globStatus(userPath, falseFilter);
+ assertNull(status);
+
+ status = fs.globStatus(new Path(userPath, "*"), falseFilter);
+ checkStatus(status);
+
+ status = fs.globStatus(new Path("/x/*"), falseFilter);
+ checkStatus(status);
+
+ status = fs.globStatus(new Path("/x"), falseFilter);
+ assertNull(status);
+
+ status = fs.globStatus(new Path("/x/x"), falseFilter);
+ assertNull(status);
+
+ }
+
+ /**
+  * Assert that a glob result is non-null and lists exactly the expected
+  * paths, in the given order. Only the path component of each URI is
+  * compared; both sides are joined with newlines so that a failure shows
+  * the complete lists.
+  * @param status glob result to check
+  * @param expectedMatches expected paths, in result order
+  */
+ private void checkStatus(FileStatus[] status, Path... expectedMatches) {
+   assertNotNull(status);
+   final String[] actualPaths = new String[status.length];
+   int next = 0;
+   for (FileStatus entry : status) {
+     actualPaths[next++] = getPathFromStatus(entry);
+   }
+   // reduce the expected values to their path components too
+   final String[] expectedPaths = new String[expectedMatches.length];
+   next = 0;
+   for (Path match : expectedMatches) {
+     expectedPaths[next++] = match.toUri().getPath();
+   }
+   assertEquals(StringUtils.join(expectedPaths, "\n"),
+       StringUtils.join(actualPaths, "\n"));
+ }
+
+ /**
+  * Extract the path component (no scheme or authority) of a status entry.
+  * @param status entry to examine
+  * @return the path part of the entry's URI
+  */
+ private String getPathFromStatus(FileStatus status) {
+   final Path entryPath = status.getPath();
+   return entryPath.toUri().getPath();
+ }
+
+ /**
+  * A user filter is applied to the results of a wildcard glob: of the
+  * created entries only {@code a/b} (index 1) is expected back.
+  */
+ @Test
+ public void testPathFilter() throws IOException {
+   String[] entries = {
+       userDir + "/a",
+       userDir + "/a/b"
+   };
+   PathFilter onlyAB = new RegexPathFilter("^.*" + userDirQuoted + "/a/b");
+   Path[] matched = prepareTesting(userDir + "/*/*", entries, onlyAB);
+   assertEquals(1, matched.length);
+   assertEquals(path[1], matched[0]);
+ }
+
+ /**
+  * The user filter also applies when the last pattern component is a
+  * literal: {@code *}{@code /b} matches a/b and c/b, and the filter is
+  * expected to leave only a/b (index 1).
+  */
+ @Test
+ public void testPathFilterWithFixedLastComponent() throws IOException {
+   String[] entries = {
+       userDir + "/a",
+       userDir + "/a/b",
+       userDir + "/c",
+       userDir + "/c/b",
+   };
+   PathFilter onlyAB = new RegexPathFilter("^.*" + userDirQuoted + "/a/b");
+   Path[] matched = prepareTesting(userDir + "/*/b", entries, onlyAB);
+   assertEquals(1, matched.length);
+   assertEquals(path[1], matched[0]);
+ }
+
+ /**
+  * A pattern with no glob characters is expected to match only the
+  * entry of the same name: {@code abc.d} (index 1).
+  */
+ @Test
+ public void pTestLiteral() throws IOException {
+   String[] entries = {
+       userDir + "/a2c",
+       userDir + "/abc.d"
+   };
+   assertMatchOperation(userDir + "/abc.d", entries, 1);
+ }
+
+ /**
+  * A backslash-escaped metacharacter in the pattern is expected to match
+  * the entry created from the same string (index 0).
+  */
+ @Test
+ public void pTestEscape() throws IOException {
+   String[] entries = {
+       userDir + "/ab\\[c.d"
+   };
+   assertMatchOperation(userDir + "/ab\\[c.d", entries, 0);
+ }
+
+ /**
+  * {@code ?} matches exactly one character: a.c, a2c and abc are expected,
+  * abcd is not. Indices are given in the order the results are returned.
+  */
+ @Test
+ public void pTestAny() throws IOException {
+   String[] entries = {
+       userDir + "/abc",
+       userDir + "/a2c",
+       userDir + "/a.c",
+       userDir + "/abcd"
+   };
+   assertMatchOperation(userDir + "/a?c", entries, 2, 1, 0);
+ }
+
+ /**
+  * A trailing {@code *} matches any suffix, including the empty one:
+  * a, abc and abc.p are expected, bacd is not.
+  */
+ @Test
+ public void pTestClosure1() throws IOException {
+   String[] entries = {
+       userDir + "/a",
+       userDir + "/abc",
+       userDir + "/abc.p",
+       userDir + "/bacd"
+   };
+   assertMatchOperation(userDir + "/a*", entries, 0, 1, 2);
+ }
+
+ /**
+  * {@code a.*} requires the literal "a." prefix: "a.", "a.old.java" and
+  * "a.txt" are expected (in result order), ".java" is not.
+  */
+ @Test
+ public void pTestClosure2() throws IOException {
+   String[] entries = {
+       userDir + "/a.",
+       userDir + "/a.txt",
+       userDir + "/a.old.java",
+       userDir + "/.java"
+   };
+   assertMatchOperation(userDir + "/a.*", entries, 0, 2, 1);
+ }
+
+ /**
+  * A {@code *} in the middle of a component: a.txt.x, ab37x and ax are
+  * expected (in result order), bacd is not.
+  */
+ @Test
+ public void pTestClosure3() throws IOException {
+   String[] entries = {
+       userDir + "/a.txt.x",
+       userDir + "/ax",
+       userDir + "/ab37x",
+       userDir + "/bacd"
+   };
+   assertMatchOperation(userDir + "/a*x", entries, 0, 2, 1);
+ }
+
+ /**
+  * A wildcard directory followed by a literal name: dir1/file1 and
+  * dir3/file1 are expected, dir2/file2 is not.
+  */
+ @Test
+ public void pTestClosure4() throws IOException {
+   String[] entries = {
+       userDir + "/dir1/file1",
+       userDir + "/dir2/file2",
+       userDir + "/dir3/file1"
+   };
+   assertMatchOperation(userDir + "/*/file1", entries, 0, 2);
+ }
+
+ /**
+  * The wildcard must consume exactly one directory level: dir1/file1 is
+  * expected, the top-level file1 is not.
+  */
+ @Test
+ public void pTestClosure5() throws IOException {
+   String[] entries = {
+       userDir + "/dir1/file1",
+       userDir + "/file1"
+   };
+   assertMatchOperation(userDir + "/*/file1", entries, 0);
+ }
+
+ /**
+  * A character set followed by two single-character wildcards: a.cpp,
+  * a.hlp and a.hxy are expected, a.c is too short to match.
+  */
+ @Test
+ public void pTestSet() throws IOException {
+   String[] entries = {
+       userDir + "/a.c",
+       userDir + "/a.cpp",
+       userDir + "/a.hlp",
+       userDir + "/a.hxy"
+   };
+   assertMatchOperation(userDir + "/a.[ch]??", entries, 1, 2, 3);
+ }
+
+ /**
+  * A character range plus one extra character in the set: a.d, a.e and
+  * a.f are expected, a.h is not.
+  */
+ @Test
+ public void pTestRange() throws IOException {
+   String[] entries = {
+       userDir + "/a.d",
+       userDir + "/a.e",
+       userDir + "/a.f",
+       userDir + "/a.h"
+   };
+   assertMatchOperation(userDir + "/a.[d-fm]", entries, 0, 1, 2);
+ }
+
+ /**
+  * A negated character set: a.d and a.e are expected, a.0 and a.h fall
+  * inside the excluded ranges.
+  */
+ @Test
+ public void pTestSetExcl() throws IOException {
+   String[] entries = {
+       userDir + "/a.d",
+       userDir + "/a.e",
+       userDir + "/a.0",
+       userDir + "/a.h"
+   };
+   assertMatchOperation(userDir + "/a.[^a-cg-z0-9]", entries, 0, 1);
+ }
+
+ /**
+  * Combines {@code ?}, {@code *}, a character set and a curly-bracket
+  * grouping in one absolute pattern; only /user/dd/a.hxy (index 3) is
+  * expected. Note the entries are absolute paths, not below the user
+  * home directory.
+  */
+ @Test
+ public void pTestCombination() throws IOException {
+   String[] entries = {
+       "/user/aa/a.c",
+       "/user/bb/a.cpp",
+       "/user1/cc/b.hlp",
+       "/user/dd/a.hxy"
+   };
+   assertMatchOperation("/use?/*/a.[ch]{lp,xy}", entries, 3);
+ }
+
+ /**
+  * Test {xx,yy}: a grouping followed by two single-character wildcards.
+  * a.abcxx and a.jhyy are expected; a.abxy and a.hlp are not.
+  */
+ @Test
+ public void pTestCurlyBracket() throws IOException {
+   String[] entries = {
+       userDir + "/a.abcxx",
+       userDir + "/a.abxy",
+       userDir + "/a.hlp",
+       userDir + "/a.jhyy"
+   };
+   assertMatchOperation(userDir + "/a.{abc,jh}??", entries, 0, 3);
+ }
+
+ /**
+  * Groupings may nest: the pattern expands to abc, abd and jh prefixes,
+  * so a.abcxx, a.abdxy and a.jhyy are expected; a.hlp is not.
+  */
+ @Test
+ public void testNestedCurlyBracket() throws Throwable {
+   String[] entries = {
+       userDir + "/a.abcxx",
+       userDir + "/a.abdxy",
+       userDir + "/a.hlp",
+       userDir + "/a.jhyy"
+   };
+   assertMatchOperation(userDir + "/a.{ab{c,d},jh}??", entries, 0, 1, 3);
+ }
+
+ /**
+  * A grouping may span path components: {a/b,c/d} is expected to match
+  * a/b and c/d but neither a/d nor c/b.
+  */
+ @Test
+ public void testCrossComponentCurlyBrackets() throws Throwable {
+   String[] entries = {
+       userDir + "/a/b",
+       userDir + "/a/d",
+       userDir + "/c/b",
+       userDir + "/c/d"
+   };
+   assertMatchOperation(userDir + "/{a/b,c/d}", entries, 0, 3);
+ }
+
+ /**
+  * Create the given entries, glob the pattern, and assert that the result
+  * is exactly the entries at {@code matchIndices}, in that order.
+  * The failure message lists both the expected and the matched paths.
+  * @param pattern glob pattern to evaluate
+  * @param files paths to create before globbing
+  * @param matchIndices indices into {@code files} of the expected matches,
+  * in the order the glob is expected to return them
+  * @return the matched paths
+  * @throws IOException on a failure creating the entries or globbing
+  */
+ public Path[] assertMatchOperation(String pattern,
+     String[] files,
+     int... matchIndices)
+     throws IOException {
+   Path[] matchedPaths = prepareTesting(pattern, files);
+   final int expectedLength = matchIndices.length;
+   StringBuilder report = new StringBuilder(expectedLength * 128);
+   report.append("Expected Paths\n");
+   for (int index : matchIndices) {
+     if (index >= path.length) {
+       // no recorded path for this index: leave it out of the report
+       continue;
+     }
+     report.append(String.format(" [%d] %s\n", index, path[index]));
+   }
+   report.append("\nMatched paths:\n ");
+   Joiner.on("\n ").appendTo(report, matchedPaths);
+   final String summary = report.toString();
+   assertEquals(summary, expectedLength, matchedPaths.length);
+   for (int i = 0; i < matchedPaths.length; i++) {
+     Path expectedPath = path[matchIndices[i]];
+     assertEquals(String.format("Element %d: in %s", i, summary),
+         expectedPath, matchedPaths[i]);
+   }
+   return matchedPaths;
+ }
+
+ /**
+  * Cross-component absolute curlies: each alternative in the grouping is
+  * itself an absolute path, so /a/b and /c/d are expected but neither
+  * /a/d nor /c/b. The entries live at the root, not below the user home
+  * directory.
+  */
+ @Test
+ public void testCrossComponentAbsoluteCurlyBrackets() throws Throwable {
+   String[] entries = {
+       "/a/b",
+       "/a/d",
+       "/c/b",
+       "/c/d"
+   };
+   assertMatchOperation("{/a/b,/c/d}", entries, 0, 3);
+ }
+
+ /**
+  * Curly brackets next to a standalone closing bracket in the name, with
+  * empty and single alternatives, then an ill-formed pattern which is
+  * expected to be rejected with an "Illegal file pattern:" IOException.
+  */
+ @Test
+ public void testStandalone() throws Throwable {
+   String[] entries = {
+       userDir + "/}bc",
+       userDir + "/}c"
+   };
+   // every well-formed pattern below starts with the literal "}"
+   final String base = userDir + "/}";
+   assertMatchOperation(base + "{a,b}c", entries, 0);
+   // test {b}
+   assertMatchOperation(base + "{b}c", entries, 0);
+   // test {}
+   assertMatchOperation(base + "{}bc", entries, 0);
+
+   // test {,}
+   assertMatchOperation(base + "{,}bc", entries, 0);
+
+   // test {b,}
+   assertMatchOperation(base + "{b,}c", entries, 0, 1);
+
+   // test {,b}
+   assertMatchOperation(base + "{,b}c", entries, 0, 1);
+
+   // test a combination of {} and ?
+   assertMatchOperation(base + "{ac,?}", entries, 1);
+
+   // test ill-formed curly
+   try {
+     prepareTesting(userDir + "}{bc", entries);
+     fail("Expected exception");
+   } catch (IOException expected) {
+     GenericTestUtils.assertExceptionContains("Illegal file pattern:",
+         expected);
+   }
+ }
+
+  /**
+   * A path name may contain characters which are special in Java regular
+   * expressions; a glob using them must match them literally.
+   */
+  @Test
+  public void pTestJavaRegexSpecialChars() throws IOException {
+    final String specialName = userDir + "/($.|+)bc";
+    final String plainName = userDir + "/abc";
+    // only the entry with the special characters is expected to match
+    assertMatchOperation(userDir + "/($.|+)*",
+        new String[] {specialName, plainName}, 0);
+  }
+
+ private Path[] prepareTesting(String pattern, String[] files)
+ throws IOException {
+ buildPaths(files);
+ Path patternPath = new Path(pattern);
+ Path[] globResults = FileUtil.stat2Paths(fs.globStatus(patternPath),
+ patternPath);
+ for (int i = 0; i < globResults.length; i++) {
+ globResults[i] =
+ globResults[i].makeQualified(fs.getUri(), fs.getWorkingDirectory());
+ }
+ return globResults;
+ }
+
+ private void buildPaths(String[] files) throws IOException {
+ for (int i = 0; i < Math.min(NUM_OF_PATHS, files.length); i++) {
+ path[i] = fs.makeQualified(new Path(files[i]));
+ if (!fs.mkdirs(path[i])) {
+ throw new IOException("Mkdirs failed to create " + path[i].toString());
+ }
+ }
+ }
+
+ private Path[] prepareTesting(String pattern, String[] files,
+ PathFilter filter) throws IOException {
+ buildPaths(files);
+ Path patternPath = new Path(pattern);
+ Path[] globResults = FileUtil.stat2Paths(fs.globStatus(patternPath, filter),
+ patternPath);
+ for (int i = 0; i < globResults.length; i++) {
+ globResults[i] =
+ globResults[i].makeQualified(fs.getUri(), fs.getWorkingDirectory());
+ }
+ return globResults;
+ }
+
+  /**
+   * Base class of the glob tests which are driven through an
+   * {@link FSTestWrapper}. The wrapper is always built around the
+   * filesystem of the enclosing test.
+   */
+  private abstract class FSTestWrapperGlobTest {
+    /** Wrapper through which the test body issues its operations. */
+    final FSTestWrapper wrap;
+
+    /**
+     * @param useFc not consulted: the wrapper is a
+     * {@link FileSystemTestWrapper} whatever the value.
+     */
+    FSTestWrapperGlobTest(boolean useFc) {
+      wrap = new FileSystemTestWrapper(fs);
+    }
+
+    /**
+     * The body of the test.
+     * @throws Exception on any failure
+     */
+    abstract void run() throws Exception;
+  }
+
+ /**
+ * Run a glob test on the filesystem.
+ * The user path is created before the test body is invoked.
+ * @param test the test to run
+ * @throws Exception any failure creating the directory or running the test
+ */
+ private void testOnFileSystem(FSTestWrapperGlobTest test) throws Exception {
+ fs.mkdirs(userPath);
+ test.run();
+ }
+
+ /**
+ * Path filter which accepts all paths: {@link #accept(Path)} returns
+ * true without examining its argument.
+ */
+ private static class AcceptAllPathFilter implements PathFilter {
+ @Override
+ public boolean accept(Path path) {
+ return true;
+ }
+ }
+
+ /** Shared instance of the filter which accepts every path. */
+ private static final PathFilter trueFilter = new AcceptAllPathFilter();
+
+  /**
+   * Accept only paths whose path component ends in a lower-case "z".
+   */
+  private static class AcceptPathsEndingInZ implements PathFilter {
+    @Override
+    public boolean accept(Path path) {
+      // only the path part of the URI is tested, not scheme or authority
+      return path.toUri().getPath().endsWith("z");
+    }
+  }
+
+ /**
+ * Test that globStatus fills in the scheme even when it is not provided.
+ */
+ private class TestGlobFillsInScheme extends FSTestWrapperGlobTest {
+ TestGlobFillsInScheme(boolean useFc) {
+ super(useFc);
+ }
+
+ void run() throws Exception {
+ // Verify that the default scheme is hdfs, when we don't supply one.
+ wrap.mkdir(new Path(userPath, "/alpha"), FsPermission.getDirDefault(),
+ false);
+ wrap.createSymlink(new Path(userPath, "/alpha"), new Path(userPath
+ + "/alphaLink"), false);
+ FileStatus statuses[] = wrap.globStatus(
+ new Path(userPath, "/alphaLink"), new AcceptAllPathFilter());
+ Assert.assertEquals(1, statuses.length);
+ Path p = statuses[0].getPath();
+ Assert.assertEquals(userDir + "/alpha", p.toUri().getPath());
+ Assert.assertEquals("hdfs", p.toUri().getScheme());
+
+ }
+ }
+
+  /**
+   * Run {@link TestGlobFillsInScheme} against the filesystem.
+   * Currently ignored.
+   */
+  @Test
+  @Ignore
+  public void testGlobFillsInSchemeOnFS() throws Exception {
+    final FSTestWrapperGlobTest schemeTest = new TestGlobFillsInScheme(false);
+    testOnFileSystem(schemeTest);
+  }
+
+  /**
+   * Test that globStatus works with relative paths: four directories with
+   * relative names are created and the pattern {@code a*} must match the
+   * three beginning with "a", resolved under the working directory.
+   */
+  private class TestRelativePath extends FSTestWrapperGlobTest {
+    TestRelativePath(boolean useFc) {
+      super(useFc);
+    }
+
+    void run() throws Exception {
+      // "bacd" is the one entry which the pattern must not pick up
+      final String[] names = {"a", "abc", "abc.p", "bacd"};
+      for (String name : names) {
+        final Path dir = wrap.makeQualified(new Path(name));
+        wrap.mkdir(dir, FsPermission.getDirDefault(), true);
+      }
+
+      final Path relativePattern = new Path("a*");
+      final FileStatus[] statuses =
+          wrap.globStatus(relativePattern, new AcceptAllPathFilter());
+      final Path[] matches = FileUtil.stat2Paths(statuses, relativePattern);
+      int slot = 0;
+      while (slot < matches.length) {
+        matches[slot] = wrap.makeQualified(matches[slot]);
+        slot++;
+      }
+
+      assertEquals(3, matches.length);
+
+      // The default working directory for FileSystem is the user's home
+      // directory. For FileContext, the default is based on the UNIX user
+      // that started the jvm. This is arguably a bug (see HADOOP-10944 for
+      // details). It is worked around here by explicitly asking for the
+      // working directory and building the expected paths from it.
+      final String cwd = wrap.getWorkingDirectory().toUri().getPath();
+      final String expected =
+          cwd + "/a;" + cwd + "/abc;" + cwd + "/abc.p";
+      assertEquals(expected, TestPath.mergeStatuses(matches));
+    }
+  }
+
+  /**
+   * Run {@link TestRelativePath} against the filesystem.
+   */
+  @Test
+  public void testRelativePathOnFS() throws Exception {
+    final FSTestWrapperGlobTest relativePathTest = new TestRelativePath(false);
+    testOnFileSystem(relativePathTest);
+  }
+
+  /**
+   * Test trying to glob the root. Regression test for HDFS-5888.
+   * The glob of "/" must return exactly one entry.
+   */
+  private class TestGlobRoot extends FSTestWrapperGlobTest {
+    TestGlobRoot(boolean useFc) {
+      super(useFc);
+    }
+
+    void run() throws Exception {
+      final Path root = new Path("/");
+      // The status is not examined; the call is retained so that the root
+      // is still probed before the glob.
+      wrap.getFileStatus(root);
+      final FileStatus[] globbed =
+          wrap.globStatus(root, new AcceptAllPathFilter());
+      Assert.assertEquals(1, globbed.length);
+      // TODO: Add any way to check that this is a real status entry,
+      // not a fake one.
+    }
+  }
+
+  /**
+   * Run {@link TestGlobRoot} against the filesystem.
+   */
+  @Test
+  public void testGlobRootOnFS() throws Exception {
+    final FSTestWrapperGlobTest globRootTest = new TestGlobRoot(false);
+    testOnFileSystem(globRootTest);
+  }
+
+  /**
+   * Test glob expressions that don't appear at the end of the path.
+   * Regression test for HADOOP-10957.
+   * A file {@code /filed} sits next to the directory {@code /filed_away};
+   * both are deleted when the test finishes.
+   */
+  private class TestNonTerminalGlobs extends FSTestWrapperGlobTest {
+    TestNonTerminalGlobs(boolean useFc) {
+      super(useFc);
+    }
+
+    void run() throws Exception {
+      final Path filed = new Path("/filed");
+      final Path filedAway = new Path("/filed_away");
+      try {
+        fs.mkdirs(new Path("/filed_away/alpha"));
+        wrap.createFile(filed, 0);
+
+        // wildcard in the first component only
+        FileStatus[] found = wrap.globStatus(
+            new Path("/filed*/alpha"), new AcceptAllPathFilter());
+        Assert.assertEquals(1, found.length);
+        String foundPath = found[0].getPath().toUri().getPath();
+        Assert.assertEquals("/filed_away/alpha", foundPath);
+
+        wrap.mkdir(new Path("/filed_away/alphabet"),
+            new FsPermission((short) 0777), true);
+        wrap.mkdir(new Path("/filed_away/alphabet/abc"),
+            new FsPermission((short) 0777), true);
+
+        // wildcards in every component
+        found = wrap.globStatus(
+            new Path("/filed*/alph*/*b*"), new AcceptAllPathFilter());
+        Assert.assertEquals(1, found.length);
+        foundPath = found[0].getPath().toUri().getPath();
+        Assert.assertEquals("/filed_away/alphabet/abc", foundPath);
+      } finally {
+        fs.delete(filed, true);
+        fs.delete(filedAway, true);
+      }
+    }
+  }
+
+  /**
+   * Run {@link TestNonTerminalGlobs} against the filesystem.
+   */
+  @Test
+  public void testNonTerminalGlobsOnFS() throws Exception {
+    final FSTestWrapperGlobTest nonTerminalTest =
+        new TestNonTerminalGlobs(false);
+    testOnFileSystem(nonTerminalTest);
+  }
+
+  /**
+   * Verify that a glob of the local filesystem returns its entries in
+   * sorted order. Five directories are created, out of order, under a
+   * uniquely named directory in {@code java.io.tmpdir}; that directory
+   * is deleted when the test finishes, whether it passed or failed.
+   * @throws Exception on any failure
+   */
+  @Test
+  public void testLocalFilesystem() throws Exception {
+    // named localFs so as not to shadow the filesystem field of the test
+    FileSystem localFs = FileSystem.getLocal(new Configuration());
+    String localTmp = System.getProperty("java.io.tmpdir");
+    Path base = new Path(new Path(localTmp), UUID.randomUUID().toString());
+    Assert.assertTrue(localFs.mkdirs(base));
+    try {
+      final String[] names = {"e", "c", "a", "d", "b"};
+      for (String name : names) {
+        Assert.assertTrue(localFs.mkdirs(new Path(base, name)));
+      }
+      FileStatus[] status = localFs.globStatus(new Path(base, "*"));
+      // without this, an empty result would count as "sorted"
+      Assert.assertEquals(names.length, status.length);
+      ArrayList<String> list = new ArrayList<>(status.length);
+      for (FileStatus f : status) {
+        list.add(f.getPath().toString());
+      }
+      Assert.assertTrue("Glob results are not sorted: " + list,
+          Ordering.<String>natural().isOrdered(list));
+    } finally {
+      // clean up even when an assertion above has failed
+      localFs.delete(base, true);
+    }
+  }
+}
+
diff --git a/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3AGlobPerformance.java b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3AGlobPerformance.java
new file mode 100644
index 0000000000000..d31cc86770d6f
--- /dev/null
+++ b/hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/s3a/scale/ITestS3AGlobPerformance.java
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.s3a.scale;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.fs.s3a.S3AFileSystem;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+
+import static org.apache.hadoop.fs.contract.ContractTestUtils.*;
+import static org.apache.hadoop.fs.s3a.S3ATestUtils.*;
+import static org.apache.hadoop.fs.s3a.Statistic.*;
+
+/**
+ * Test the performance of glob operations.
+ * Each pattern is globbed twice, once through
+ * {@code S3AFileSystem.globStatusClassic()} and once through
+ * {@code S3AFileSystem.globStatus()}; the results are asserted to be
+ * equivalent and the request metrics of each run are logged.
+ */
+public class ITestS3AGlobPerformance extends S3AScaleTestBase {
+  private static final Logger LOG = LoggerFactory.getLogger(
+      ITestS3AGlobPerformance.class);
+
+  /** Filter handed to both glob implementations; accepts every path. */
+  private static final PathFilter ACCEPT_ALL_PATHS =
+      new AcceptAllPathFilter();
+
+  /**
+   * Create a directory tree, walk it as a baseline, then glob it with
+   * several patterns, comparing the classic and S3A globs each time.
+   * The tree is deleted at the end of the run, whatever the outcome.
+   * @throws Throwable on any failure
+   */
+  @Test
+  public void testGlobOperations() throws Throwable {
+    describe("Test glob operations");
+    final Path scaleTestDir = getTestPath();
+    final Path listDir = new Path(scaleTestDir, "lists");
+    S3AFileSystem fs = getFileSystem();
+
+    // scale factor: used for the width, depth and file count of the tree
+    int scale = getConf().getInt(KEY_DIRECTORY_COUNT, DEFAULT_DIRECTORY_COUNT);
+    int width = scale;
+    int depth = scale;
+    int files = scale;
+    MetricDiff metadataRequests = new MetricDiff(fs, OBJECT_METADATA_REQUESTS);
+    MetricDiff listRequests = new MetricDiff(fs, OBJECT_LIST_REQUESTS);
+    MetricDiff listContinueRequests =
+        new MetricDiff(fs, OBJECT_CONTINUE_LIST_REQUESTS);
+    MetricDiff listStatusCalls = new MetricDiff(fs, INVOCATION_LIST_FILES);
+    MetricDiff globStatusCalls = new MetricDiff(fs, INVOCATION_GLOB_STATUS);
+    MetricDiff getFileStatusCalls =
+        new MetricDiff(fs, INVOCATION_GET_FILE_STATUS);
+    NanoTimer createTimer = new NanoTimer();
+    TreeScanResults created =
+        createSubdirs(fs, listDir, depth, width, files, 0);
+    // add some empty directories
+    int emptyDepth = scale;
+    int emptyWidth = 3 * scale;
+
+    created.add(createSubdirs(fs, listDir, emptyDepth, emptyWidth, 0,
+        0, "empty", "f-", ""));
+    createTimer.end("Time to create %s", created);
+    LOG.info("Time per operation: {}",
+        toHuman(createTimer.nanosPerOperation(created.totalCount())));
+    printThenReset(LOG,
+        metadataRequests,
+        globStatusCalls,
+        listRequests,
+        listContinueRequests,
+        listStatusCalls,
+        getFileStatusCalls);
+
+    try {
+      // Scan the directory via an explicit tree walk.
+      // This is the baseline for any listing speedups.
+      describe("Listing files via treewalk");
+      NanoTimer treeWalkTimer = new NanoTimer();
+      TreeScanResults treewalkResults = treeWalk(fs, listDir);
+      treeWalkTimer.end("List status via treewalk of %s", created);
+
+      printThenReset(LOG,
+          metadataRequests,
+          listRequests,
+          listContinueRequests,
+          listStatusCalls,
+          getFileStatusCalls);
+      Path globPattern = new Path(listDir, "{*/*.txt,*.txt}");
+
+      // the files found by this glob are checked against the tree walk
+      TreeScanResults globAll = compareGlobs(globPattern);
+      treewalkResults.assertFieldsEquivalent(
+          "Files found in s3 glob(" + globPattern + ")", globAll,
+          treewalkResults.getFiles(),
+          globAll.getFiles());
+
+      compareGlobs(new Path(listDir, "*.txt"));
+      compareGlobs(new Path(listDir, "*"));
+    } finally {
+      describe("deletion");
+      // deletion at the end of the run
+      NanoTimer deleteTimer = new NanoTimer();
+      fs.delete(listDir, true);
+      deleteTimer.end("Deleting directory tree");
+      printThenReset(LOG,
+          metadataRequests,
+          listRequests,
+          listContinueRequests,
+          listStatusCalls,
+          getFileStatusCalls);
+    }
+  }
+
+  /**
+   * Glob the pattern with the classic and then the S3A implementation,
+   * logging and resetting the metric differences after each, and assert
+   * that the two sets of results are equivalent.
+   * @param globPattern pattern to glob
+   * @return the results of the S3A glob
+   * @throws IOException IO problems.
+   */
+  private TreeScanResults compareGlobs(Path globPattern) throws IOException {
+    S3AFileSystem fs = getFileSystem();
+
+    MetricDiff metadataRequests = new MetricDiff(fs, OBJECT_METADATA_REQUESTS);
+    MetricDiff listRequests = new MetricDiff(fs, OBJECT_LIST_REQUESTS);
+    MetricDiff listContinueRequests =
+        new MetricDiff(fs, OBJECT_CONTINUE_LIST_REQUESTS);
+    MetricDiff listStatusCalls = new MetricDiff(fs, INVOCATION_LIST_FILES);
+    MetricDiff globStatusCalls = new MetricDiff(fs, INVOCATION_GLOB_STATUS);
+    MetricDiff getFileStatusCalls =
+        new MetricDiff(fs, INVOCATION_GET_FILE_STATUS);
+
+    describe("FileSystem.globStatus operation on %s", globPattern);
+    TreeScanResults classicResults = classicGlob(globPattern);
+    printThenReset(LOG,
+        metadataRequests,
+        globStatusCalls,
+        listRequests,
+        listContinueRequests,
+        listStatusCalls,
+        getFileStatusCalls);
+
+    describe("S3A globStatus operation on %s", globPattern);
+    TreeScanResults s3aGlobResults = s3aGlob(globPattern);
+    printThenReset(LOG,
+        metadataRequests,
+        globStatusCalls,
+        listRequests,
+        listContinueRequests,
+        listStatusCalls,
+        getFileStatusCalls);
+    classicResults.assertEquivalent(s3aGlobResults);
+
+    return s3aGlobResults;
+  }
+
+  /**
+   * Baseline: classic glob operation. Timing is logged.
+   * @param globPattern pattern to glob
+   * @return the results of the scan
+   * @throws IOException IO problems.
+   */
+  private TreeScanResults classicGlob(Path globPattern)
+      throws IOException {
+    NanoTimer timer = new NanoTimer();
+    TreeScanResults results = new TreeScanResults(
+        getFileSystem().globStatusClassic(globPattern, ACCEPT_ALL_PATHS));
+    timer.end("Classic Glob of %s: %s", globPattern, results);
+    return results;
+  }
+
+  /**
+   * Optimized S3A glob operation. Timing is logged.
+   * @param globPattern pattern to glob
+   * @return the results of the scan
+   * @throws IOException IO problems.
+   */
+  private TreeScanResults s3aGlob(Path globPattern)
+      throws IOException {
+    NanoTimer timer = new NanoTimer();
+    TreeScanResults results = new TreeScanResults(
+        getFileSystem().globStatus(globPattern, ACCEPT_ALL_PATHS));
+    timer.end("S3A Glob of %s: %s", globPattern, results);
+    return results;
+  }
+
+  /**
+   * Accept all paths.
+   */
+  private static class AcceptAllPathFilter implements PathFilter {
+    @Override
+    public boolean accept(Path path) {
+      return true;
+    }
+  }
+}