Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,16 @@ protected URI canonicalizeUri(URI rawUri) {
return S3xLoginHelper.canonicalizeUri(rawUri, getDefaultPort());
}

/**
* Make this protected method public so that {@link S3AGlobber} can access it.
* The call is forwarded unchanged to the superclass implementation.
* @param p path
* @return the fixed path.
*/
@Override
public Path fixRelativePart(Path p) {
return super.fixRelativePart(p);
}

/**
* Opens an FSDataInputStream at the indicated Path.
* @param f the file name to open
Expand Down Expand Up @@ -2114,23 +2124,42 @@ int getMaxKeys() {
}

/**
 * Override superclass to use the new {@code S3AGlobber}.
 * <p>
 * Delegates to {@link #globStatus(Path, PathFilter)} with
 * {@code ACCEPT_ALL} as the filter. The statistic
 * {@link Statistic#INVOCATION_GLOB_STATUS} is incremented by that overload,
 * so it is deliberately not incremented here: doing both would count every
 * single-argument invocation twice.
 * {@inheritDoc}
 */
@Override
public FileStatus[] globStatus(Path pathPattern) throws IOException {
  return globStatus(pathPattern, ACCEPT_ALL);
}

/**
 * Glob the path pattern through an {@code S3AGlobber} rather than the
 * superclass implementation.
 * Increments the statistic {@link Statistic#INVOCATION_GLOB_STATUS}
 * before the glob is evaluated.
 * {@inheritDoc}
 */
@Override
public FileStatus[] globStatus(Path pathPattern, PathFilter filter)
    throws IOException {
  // Count the invocation first, exactly once per call.
  incrementStatistic(INVOCATION_GLOB_STATUS);
  final S3AGlobber globber = new S3AGlobber(this, pathPattern, filter);
  return globber.glob();
}

/**
* Invoke the base {@link FileSystem#globStatus(Path, PathFilter)}.
* This is purely to allow tests to compare both the performance
* and results of the glob operation.
* Increments the statistic {@link Statistic#INVOCATION_GLOB_STATUS}.
* <p>
* <i>For Testing only.</i>
* @param pathPattern a glob pattern specifying the paths to match
* @param filter a user-supplied path filter
* @return an array of FileStatus objects, exactly as returned by the
* superclass implementation
* @throws IOException if any I/O error occurs when fetching file status
*/
public FileStatus[] globStatusClassic(Path pathPattern, PathFilter filter)
throws IOException {
incrementStatistic(INVOCATION_GLOB_STATUS);
// Bypass the overridden globStatus and go straight to the superclass.
return super.globStatus(pathPattern, filter);
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,291 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.fs.s3a;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FsTracer;
import org.apache.hadoop.fs.GlobExpander;
import org.apache.hadoop.fs.GlobFilter;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.htrace.core.TraceScope;
import org.apache.htrace.core.Tracer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * A special variant of the {@code Globber} class, designed for S3A performance.
 * <p>
 * An instance is bound to one filesystem, one path pattern and one
 * user-supplied filter; {@link #glob()} evaluates the pattern and returns the
 * matching entries.
 */
@InterfaceAudience.Private
@InterfaceStability.Unstable
class S3AGlobber {
  public static final Logger LOG = LoggerFactory.getLogger(S3AGlobber.class);

  private final S3AFileSystem fs;
  private final Path pathPattern;
  private final PathFilter filter;
  private final Tracer tracer;

  /**
   * Create a globber.
   * @param fs filesystem to glob against; its configuration is used to
   *           look up the tracer
   * @param pathPattern pattern to evaluate
   * @param filter filter applied to the final matches
   */
  S3AGlobber(S3AFileSystem fs, Path pathPattern, PathFilter filter) {
    this.fs = fs;
    this.pathPattern = pathPattern;
    this.filter = filter;
    this.tracer = FsTracer.get(fs.getConf());
  }

  /**
   * Get the status of a path, mapping "not found" to null.
   * @param path path to probe
   * @return the status, or null if the filesystem raised a
   *         {@link FileNotFoundException}
   * @throws IOException any other I/O failure
   */
  private FileStatus getFileStatus(Path path) throws IOException {
    try {
      return fs.getFileStatus(path);
    } catch (FileNotFoundException e) {
      return null;
    }
  }

  /**
   * List a path, mapping "not found" to an empty listing.
   * @param path path to list
   * @return the listing, or an empty array if the filesystem raised a
   *         {@link FileNotFoundException}
   * @throws IOException any other I/O failure
   */
  private FileStatus[] listStatus(Path path) throws IOException {
    try {
      return fs.listStatus(path);
    } catch (FileNotFoundException e) {
      return new FileStatus[0];
    }
  }

  /**
   * Forward to {@code S3AFileSystem.fixRelativePart(Path)}.
   * @param path path to fix
   * @return the path returned by the filesystem
   */
  private Path fixRelativePart(Path path) {
    return fs.fixRelativePart(path);
  }

  /**
   * Convert a path component that contains backslash escape sequences to a
   * literal string. This is necessary when you want to explicitly refer to a
   * path that contains globber metacharacters.
   * @param name escaped path component
   * @return the component with each backslash-escaped character replaced by
   *         the character itself
   */
  private static String unescapePathComponent(String name) {
    return name.replaceAll("\\\\(.)", "$1");
  }

  /**
   * Translate an absolute path into a list of path components.
   * We merge double slashes into a single slash here.
   * POSIX root path, i.e. '/', does not get an entry in the list.
   * @param path path string to split on {@link Path#SEPARATOR}
   * @return a mutable list of the non-empty components, in order
   */
  private static List<String> getPathComponents(String path) {
    List<String> ret = new ArrayList<>();
    for (String component : path.split(Path.SEPARATOR)) {
      if (!component.isEmpty()) {
        ret.add(component);
      }
    }
    return ret;
  }

  /**
   * Get the scheme of a path, falling back to that of the filesystem URI.
   * @param path path to examine
   * @return the scheme of the path, or of the filesystem if the path has none
   */
  private String schemeFromPath(Path path) {
    String scheme = path.toUri().getScheme();
    if (scheme == null) {
      scheme = fs.getUri().getScheme();
    }
    return scheme;
  }

  /**
   * Get the authority of a path, falling back to that of the filesystem URI.
   * @param path path to examine
   * @return the authority of the path, or of the filesystem if the path
   *         has none
   */
  private String authorityFromPath(Path path) {
    String authority = path.toUri().getAuthority();
    if (authority == null) {
      // fs is dereferenced in the constructor, so it is never null here.
      authority = fs.getUri().getAuthority();
    }
    return authority;
  }

  /**
   * Evaluate the glob inside a trace scope named {@code S3AGlobber#glob},
   * annotated with the path part of the pattern.
   * @return the sorted matches; null when the pattern had no wildcard and no
   *         grouping and nothing was found
   * @throws IOException on an I/O failure other than "not found"
   */
  public FileStatus[] glob() throws IOException {
    try (TraceScope scope = tracer.newScope("S3AGlobber#glob")) {
      scope.addKVAnnotation("pattern", pathPattern.toUri().getPath());
      return doGlob();
    }
  }

  /**
   * The glob algorithm: expand groupings, then walk each flattened pattern
   * component by component, narrowing a candidate list as it goes.
   * @return the sorted matches, or null (see {@link #glob()})
   * @throws IOException on an I/O failure other than "not found"
   */
  private FileStatus[] doGlob() throws IOException {
    // First we get the scheme and authority of the pattern that was passed
    // in.
    String scheme = schemeFromPath(pathPattern);
    String authority = authorityFromPath(pathPattern);

    // Next we strip off everything except the pathname itself, and expand all
    // globs. Expansion is a process which turns "grouping" clauses,
    // expressed as brackets, into separate path patterns.
    String pathPatternString = pathPattern.toUri().getPath();
    List<String> flattenedPatterns = GlobExpander.expand(pathPatternString);

    // Now loop over all flattened patterns. In every case, we'll be trying to
    // match them to entries in the filesystem.
    List<FileStatus> results = new ArrayList<>(flattenedPatterns.size());

    boolean sawWildcard = false;
    for (String flatPattern : flattenedPatterns) {
      // Get the absolute path for this flattened pattern. We couldn't do
      // this prior to flattening because of patterns like {/,a}, where which
      // path you go down influences how the path must be made absolute.
      Path absPattern = fixRelativePart(new Path(
          flatPattern.isEmpty() ? Path.CUR_DIR : flatPattern));
      // Now we break the flattened, absolute pattern into path components.
      // For example, /a/*/c would be broken into the list [a, *, c]
      List<String> components =
          getPathComponents(absPattern.toUri().getPath());
      // Starting out at the root of the filesystem, we try to match
      // filesystem entries against pattern components.
      List<FileStatus> candidates = new ArrayList<>(1);
      // Rather than fetch the "real" FileStatus of root up front, we create
      // a placeholder FileStatus which has the correct path, but defaults
      // for the rest of the information. Later, if it turns out we actually
      // want the FileStatus of root, we'll replace the placeholder with a
      // real FileStatus obtained from the filesystem.
      FileStatus rootPlaceholder;
      if (Path.WINDOWS && !components.isEmpty()
          && Path.isWindowsAbsolutePath(absPattern.toUri().getPath(), true)) {
        // On Windows the path could begin with a drive letter, e.g. /E:/foo.
        // We will skip matching the drive letter and start from listing the
        // root of the filesystem on that drive.
        String driveLetter = components.remove(0);
        rootPlaceholder = new FileStatus(0, true, 0, 0, 0, new Path(scheme,
            authority, Path.SEPARATOR + driveLetter + Path.SEPARATOR));
      } else {
        rootPlaceholder = new FileStatus(0, true, 0, 0, 0,
            new Path(scheme, authority, Path.SEPARATOR));
      }
      candidates.add(rootPlaceholder);

      for (int componentIdx = 0; componentIdx < components.size();
          componentIdx++) {
        List<FileStatus> newCandidates = new ArrayList<>(candidates.size());
        GlobFilter globFilter = new GlobFilter(components.get(componentIdx));
        String component = unescapePathComponent(components.get(componentIdx));
        boolean isGlob = globFilter.hasPattern();
        boolean terminal = componentIdx == components.size() - 1;
        if (isGlob) {
          sawWildcard = true;
        }
        if (candidates.isEmpty() && sawWildcard) {
          // Optimization: if there are no more candidates left, stop examining
          // the path components. We can only do this if we've already seen
          // a wildcard component-- otherwise, we still need to visit all path
          // components in case one of them is a wildcard.
          break;
        }
        if (!terminal && !isGlob) {
          // Optimization: if this is not the terminal path component, and we
          // are not matching against a glob, assume that it exists. If it
          // doesn't exist, we'll find out later when resolving a later glob
          // or the terminal path component.
          for (FileStatus candidate : candidates) {
            candidate.setPath(new Path(candidate.getPath(), component));
          }
          continue;
        }
        for (FileStatus candidate : candidates) {
          if (isGlob) {
            collectGlobMatches(candidate, globFilter, terminal, newCandidates);
          } else {
            // When dealing with non-glob components, use getFileStatus
            // instead of listStatus: a single probe of the named child is
            // enough, and it finds entries which exist but are not returned
            // by a listing of the parent.
            FileStatus childStatus = getFileStatus(
                new Path(candidate.getPath(), component));
            if (childStatus != null) {
              newCandidates.add(childStatus);
            }
          }
        }
        candidates = newCandidates;
      }
      for (FileStatus status : candidates) {
        // Use object equality to see if this status is the root placeholder.
        // See the explanation for rootPlaceholder above for more information.
        if (status == rootPlaceholder) {
          status = getFileStatus(rootPlaceholder.getPath());
          if (status == null) {
            continue;
          }
        }
        // HADOOP-3497 semantics: the user-defined filter is applied at the
        // end, once the full path is built up.
        if (filter.accept(status.getPath())) {
          results.add(status);
        }
      }
    }
    /*
     * When the input pattern "looks" like just a simple filename, and we
     * can't find it, we return null rather than an empty array.
     * This is a special case which the shell relies on.
     *
     * To be more precise: if there were no results, AND there were no
     * groupings (aka brackets), and no wildcards in the input (aka stars),
     * we return null.
     */
    if ((!sawWildcard) && results.isEmpty() &&
        (flattenedPatterns.size() <= 1)) {
      return null;
    }
    /*
     * In general, the results list will already be sorted, since listStatus
     * returns results in sorted order for many Hadoop filesystems. However,
     * not all Hadoop filesystems have this property. So we sort here in order
     * to get consistent results. See HADOOP-10798 for details.
     */
    FileStatus[] ret = results.toArray(new FileStatus[results.size()]);
    Arrays.sort(ret);
    return ret;
  }

  /**
   * List one candidate and add every child accepted by the glob filter to
   * the list of matches.
   * @param candidate entry whose children are to be matched
   * @param globFilter filter built from the current pattern component
   * @param terminal true if this is the last component of the pattern
   * @param matches list to which accepted children are appended
   * @throws IOException on an I/O failure other than "not found"
   */
  private void collectGlobMatches(FileStatus candidate, GlobFilter globFilter,
      boolean terminal, List<FileStatus> matches) throws IOException {
    Path parent = candidate.getPath();
    FileStatus[] children = listStatus(parent);
    if (children.length == 1) {
      // If we get back only one result, this could be either a listing
      // of a directory with one entry, or it could reflect the fact
      // that what we listed resolved to a file.
      //
      // Unfortunately, we can't just compare the returned paths to
      // figure this out. Consider the case where you have /a/b, where
      // b is a symlink to "..". In that case, listing /a/b will give
      // back "/a/b" again. If we just went by returned pathname, we'd
      // incorrectly conclude that /a/b was a file and should not match
      // /a/*/*. So we use getFileStatus of the path we just listed to
      // disambiguate.
      FileStatus parentStatus = getFileStatus(parent);
      if (parentStatus == null) {
        // The entry was listed but is gone by the time it is probed;
        // skip it rather than fail with a NullPointerException.
        LOG.warn("File/directory {} not found: it may have been deleted",
            parent);
        return;
      }
      if (!parentStatus.isDirectory()) {
        return;
      }
    }
    for (FileStatus child : children) {
      // Don't try to recurse into non-directories. See HADOOP-10957.
      if (!terminal && !child.isDirectory()) {
        continue;
      }
      // Set the child path based on the parent path.
      child.setPath(new Path(parent, child.getPath().getName()));
      if (globFilter.accept(child.getPath())) {
        matches.add(child);
      }
    }
  }
}
Loading