HADOOP-9981. globStatus should minimize its listStatus and getFileStatus calls. (Contributed by Colin Patrick McCabe)
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1526300 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
101664055c
commit
138fd7581f
|
@ -81,6 +81,9 @@ Release 2.3.0 - UNRELEASED
|
|||
HADOOP-9791. Add a test case covering long paths for new FileUtil access
|
||||
check methods (ivanmi)
|
||||
|
||||
HADOOP-9981. globStatus should minimize its listStatus and getFileStatus
|
||||
calls. (Contributed by Colin Patrick McCabe)
|
||||
|
||||
Release 2.2.0 - UNRELEASED
|
||||
|
||||
INCOMPATIBLE CHANGES
|
||||
|
|
|
@ -83,6 +83,15 @@ class Globber {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a path component that contains backslash ecape sequences to a
|
||||
* literal string. This is necessary when you want to explicitly refer to a
|
||||
* path that contains globber metacharacters.
|
||||
*/
|
||||
private static String unescapePathComponent(String name) {
|
||||
return name.replaceAll("\\\\(.)", "$1");
|
||||
}
|
||||
|
||||
/**
|
||||
* Translate an absolute path into a list of path components.
|
||||
* We merge double slashes into a single slash here.
|
||||
|
@ -166,37 +175,72 @@ class Globber {
|
|||
new Path(scheme, authority, Path.SEPARATOR)));
|
||||
}
|
||||
|
||||
for (String component : components) {
|
||||
for (int componentIdx = 0; componentIdx < components.size();
|
||||
componentIdx++) {
|
||||
ArrayList<FileStatus> newCandidates =
|
||||
new ArrayList<FileStatus>(candidates.size());
|
||||
GlobFilter globFilter = new GlobFilter(component);
|
||||
GlobFilter globFilter = new GlobFilter(components.get(componentIdx));
|
||||
String component = unescapePathComponent(components.get(componentIdx));
|
||||
if (globFilter.hasPattern()) {
|
||||
sawWildcard = true;
|
||||
}
|
||||
if (candidates.isEmpty() && sawWildcard) {
|
||||
// Optimization: if there are no more candidates left, stop examining
|
||||
// the path components. We can only do this if we've already seen
|
||||
// a wildcard component-- otherwise, we still need to visit all path
|
||||
// components in case one of them is a wildcard.
|
||||
break;
|
||||
}
|
||||
if ((componentIdx < components.size() - 1) &&
|
||||
(!globFilter.hasPattern())) {
|
||||
// Optimization: if this is not the terminal path component, and we
|
||||
// are not matching against a glob, assume that it exists. If it
|
||||
// doesn't exist, we'll find out later when resolving a later glob
|
||||
// or the terminal path component.
|
||||
for (FileStatus candidate : candidates) {
|
||||
candidate.setPath(new Path(candidate.getPath(), component));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
for (FileStatus candidate : candidates) {
|
||||
FileStatus resolvedCandidate = candidate;
|
||||
if (candidate.isSymlink()) {
|
||||
// We have to resolve symlinks, because otherwise we don't know
|
||||
// whether they are directories.
|
||||
resolvedCandidate = getFileStatus(candidate.getPath());
|
||||
}
|
||||
if (resolvedCandidate == null ||
|
||||
resolvedCandidate.isDirectory() == false) {
|
||||
continue;
|
||||
}
|
||||
FileStatus[] children = listStatus(candidate.getPath());
|
||||
for (FileStatus child : children) {
|
||||
// Set the child path based on the parent path.
|
||||
// This keeps the symlinks in our path.
|
||||
child.setPath(new Path(candidate.getPath(),
|
||||
child.getPath().getName()));
|
||||
if (globFilter.accept(child.getPath())) {
|
||||
newCandidates.add(child);
|
||||
if (globFilter.hasPattern()) {
|
||||
FileStatus[] children = listStatus(candidate.getPath());
|
||||
if (children.length == 1) {
|
||||
// If we get back only one result, this could be either a listing
|
||||
// of a directory with one entry, or it could reflect the fact
|
||||
// that what we listed resolved to a file.
|
||||
//
|
||||
// Unfortunately, we can't just compare the returned paths to
|
||||
// figure this out. Consider the case where you have /a/b, where
|
||||
// b is a symlink to "..". In that case, listing /a/b will give
|
||||
// back "/a/b" again. If we just went by returned pathname, we'd
|
||||
// incorrectly conclude that /a/b was a file and should not match
|
||||
// /a/*/*. So we use getFileStatus of the path we just listed to
|
||||
// disambiguate.
|
||||
if (!getFileStatus(candidate.getPath()).isDirectory()) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (FileStatus child : children) {
|
||||
// Set the child path based on the parent path.
|
||||
child.setPath(new Path(candidate.getPath(),
|
||||
child.getPath().getName()));
|
||||
if (globFilter.accept(child.getPath())) {
|
||||
newCandidates.add(child);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// When dealing with non-glob components, use getFileStatus
|
||||
// instead of listStatus. This is an optimization, but it also
|
||||
// is necessary for correctness in HDFS, since there are some
|
||||
// special HDFS directories like .reserved and .snapshot that are
|
||||
// not visible to listStatus, but which do exist. (See HADOOP-9877)
|
||||
FileStatus childStatus = getFileStatus(
|
||||
new Path(candidate.getPath(), component));
|
||||
if (childStatus != null) {
|
||||
newCandidates.add(childStatus);
|
||||
}
|
||||
}
|
||||
}
|
||||
candidates = newCandidates;
|
||||
}
|
||||
|
|
|
@ -28,6 +28,7 @@ import org.apache.hadoop.conf.Configuration;
|
|||
import org.apache.hadoop.fs.permission.FsPermission;
|
||||
import org.apache.hadoop.hdfs.HdfsConfiguration;
|
||||
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
||||
import org.apache.hadoop.hdfs.server.namenode.INodeId;
|
||||
import org.apache.hadoop.security.AccessControlException;
|
||||
import org.apache.hadoop.security.UserGroupInformation;
|
||||
import org.junit.*;
|
||||
|
@ -362,17 +363,6 @@ public class TestGlobPaths {
|
|||
status = fs.globStatus(new Path(USER_DIR+"{/dir*}"));
|
||||
checkStatus(status, d1, d2, d3, d4);
|
||||
|
||||
/*
|
||||
* true filter
|
||||
*/
|
||||
|
||||
PathFilter trueFilter = new PathFilter() {
|
||||
@Override
|
||||
public boolean accept(Path path) {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
status = fs.globStatus(new Path(Path.SEPARATOR), trueFilter);
|
||||
checkStatus(status, new Path(Path.SEPARATOR));
|
||||
|
||||
|
@ -844,6 +834,8 @@ public class TestGlobPaths {
|
|||
}
|
||||
}
|
||||
|
||||
private static final PathFilter trueFilter = new AcceptAllPathFilter();
|
||||
|
||||
/**
|
||||
* Accept only paths ending in Z.
|
||||
*/
|
||||
|
@ -894,11 +886,13 @@ public class TestGlobPaths {
|
|||
}
|
||||
}
|
||||
|
||||
@Ignore
|
||||
@Test
|
||||
public void testGlobWithSymlinksOnFS() throws Exception {
|
||||
testOnFileSystem(new TestGlobWithSymlinks());
|
||||
}
|
||||
|
||||
@Ignore
|
||||
@Test
|
||||
public void testGlobWithSymlinksOnFC() throws Exception {
|
||||
testOnFileContext(new TestGlobWithSymlinks());
|
||||
|
@ -971,11 +965,13 @@ public class TestGlobPaths {
|
|||
}
|
||||
}
|
||||
|
||||
@Ignore
|
||||
@Test
|
||||
public void testGlobWithSymlinksToSymlinksOnFS() throws Exception {
|
||||
testOnFileSystem(new TestGlobWithSymlinksToSymlinks());
|
||||
}
|
||||
|
||||
@Ignore
|
||||
@Test
|
||||
public void testGlobWithSymlinksToSymlinksOnFC() throws Exception {
|
||||
testOnFileContext(new TestGlobWithSymlinksToSymlinks());
|
||||
|
@ -1020,11 +1016,13 @@ public class TestGlobPaths {
|
|||
}
|
||||
}
|
||||
|
||||
@Ignore
|
||||
@Test
|
||||
public void testGlobSymlinksWithCustomPathFilterOnFS() throws Exception {
|
||||
testOnFileSystem(new TestGlobSymlinksWithCustomPathFilter());
|
||||
}
|
||||
|
||||
@Ignore
|
||||
@Test
|
||||
public void testGlobSymlinksWithCustomPathFilterOnFC() throws Exception {
|
||||
testOnFileContext(new TestGlobSymlinksWithCustomPathFilter());
|
||||
|
@ -1045,7 +1043,7 @@ public class TestGlobPaths {
|
|||
new Path(USER_DIR + "/alphaLink"), new AcceptAllPathFilter());
|
||||
Assert.assertEquals(1, statuses.length);
|
||||
Path path = statuses[0].getPath();
|
||||
Assert.assertEquals(USER_DIR + "/alphaLink", path.toUri().getPath());
|
||||
Assert.assertEquals(USER_DIR + "/alpha", path.toUri().getPath());
|
||||
Assert.assertEquals("hdfs", path.toUri().getScheme());
|
||||
if (fc != null) {
|
||||
// If we're using FileContext, then we can list a file:/// URI.
|
||||
|
@ -1151,4 +1149,31 @@ public class TestGlobPaths {
|
|||
public void testGlobAccessDeniedOnFC() throws Exception {
|
||||
testOnFileContext(new TestGlobAccessDenied());
|
||||
}
|
||||
|
||||
/**
|
||||
* Test that trying to list a reserved path on HDFS via the globber works.
|
||||
**/
|
||||
private static class TestReservedHdfsPaths implements FSTestWrapperGlobTest {
|
||||
public void run(FSTestWrapper wrap, FSTestWrapper unprivilegedWrap,
|
||||
FileSystem fs, FileContext fc) throws Exception {
|
||||
String reservedRoot = "/.reserved/.inodes/" + INodeId.ROOT_INODE_ID;
|
||||
Assert.assertEquals(reservedRoot,
|
||||
TestPath.mergeStatuses(unprivilegedWrap.
|
||||
globStatus(new Path(reservedRoot), new AcceptAllPathFilter())));
|
||||
// These inodes don't show up via listStatus.
|
||||
Assert.assertEquals("",
|
||||
TestPath.mergeStatuses(unprivilegedWrap.
|
||||
globStatus(new Path("/.reserved/*"), new AcceptAllPathFilter())));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReservedHdfsPathsOnFS() throws Exception {
|
||||
testOnFileSystem(new TestReservedHdfsPaths());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReservedHdfsPathsOnFC() throws Exception {
|
||||
testOnFileContext(new TestReservedHdfsPaths());
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue