HADOOP-6787. Factor out glob pattern code from FileContext and FileSystem. Contributed by Luke Lu.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@951207 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
9e2c3bf9ed
commit
c6bc36580a
|
@ -57,6 +57,11 @@ Trunk (unreleased changes)
|
|||
|
||||
HADOOP-6747. TestNetUtils fails on Mac OS X. (Todd Lipcon via jghoman)
|
||||
|
||||
HADOOP-6787. Factor out glob pattern code from FileContext and
|
||||
FileSystem. Also fix bugs identified in HADOOP-6618 and make the
|
||||
glob pattern code less restrictive and more POSIX standard
|
||||
compliant. (Luke Lu via eli)
|
||||
|
||||
Release 0.21.0 - Unreleased
|
||||
|
||||
INCOMPATIBLE CHANGES
|
||||
|
|
|
@ -1874,131 +1874,6 @@ public final class FileContext {
|
|||
}
|
||||
}
|
||||
|
||||
/* A class that could decide if a string matches the glob or not */
|
||||
private static class GlobFilter implements PathFilter {
|
||||
private PathFilter userFilter = DEFAULT_FILTER;
|
||||
private Pattern regex;
|
||||
private boolean hasPattern = false;
|
||||
|
||||
/** Default pattern character: Escape any special meaning. */
|
||||
private static final char PAT_ESCAPE = '\\';
|
||||
/** Default pattern character: Any single character. */
|
||||
private static final char PAT_ANY = '.';
|
||||
/** Default pattern character: Character set close. */
|
||||
private static final char PAT_SET_CLOSE = ']';
|
||||
|
||||
GlobFilter(final String filePattern) {
|
||||
setRegex(filePattern);
|
||||
}
|
||||
|
||||
GlobFilter(final String filePattern, final PathFilter filter) {
|
||||
userFilter = filter;
|
||||
setRegex(filePattern);
|
||||
}
|
||||
|
||||
private boolean isJavaRegexSpecialChar(char pChar) {
|
||||
return pChar == '.' || pChar == '$' || pChar == '(' || pChar == ')' ||
|
||||
pChar == '|' || pChar == '+';
|
||||
}
|
||||
|
||||
void setRegex(String filePattern) {
|
||||
int len;
|
||||
int setOpen;
|
||||
int curlyOpen;
|
||||
boolean setRange;
|
||||
|
||||
StringBuilder fileRegex = new StringBuilder();
|
||||
|
||||
// Validate the pattern
|
||||
len = filePattern.length();
|
||||
if (len == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
setOpen = 0;
|
||||
setRange = false;
|
||||
curlyOpen = 0;
|
||||
|
||||
for (int i = 0; i < len; i++) {
|
||||
char pCh;
|
||||
|
||||
// Examine a single pattern character
|
||||
pCh = filePattern.charAt(i);
|
||||
if (pCh == PAT_ESCAPE) {
|
||||
fileRegex.append(pCh);
|
||||
i++;
|
||||
if (i >= len) {
|
||||
error("An escaped character does not present", filePattern, i);
|
||||
}
|
||||
pCh = filePattern.charAt(i);
|
||||
} else if (isJavaRegexSpecialChar(pCh)) {
|
||||
fileRegex.append(PAT_ESCAPE);
|
||||
} else if (pCh == '*') {
|
||||
fileRegex.append(PAT_ANY);
|
||||
hasPattern = true;
|
||||
} else if (pCh == '?') {
|
||||
pCh = PAT_ANY;
|
||||
hasPattern = true;
|
||||
} else if (pCh == '{') {
|
||||
fileRegex.append('(');
|
||||
pCh = '(';
|
||||
curlyOpen++;
|
||||
hasPattern = true;
|
||||
} else if (pCh == ',' && curlyOpen > 0) {
|
||||
fileRegex.append(")|");
|
||||
pCh = '(';
|
||||
} else if (pCh == '}' && curlyOpen > 0) {
|
||||
// End of a group
|
||||
curlyOpen--;
|
||||
fileRegex.append(")");
|
||||
pCh = ')';
|
||||
} else if (pCh == '[' && setOpen == 0) {
|
||||
setOpen++;
|
||||
hasPattern = true;
|
||||
} else if (pCh == '^' && setOpen > 0) {
|
||||
} else if (pCh == '-' && setOpen > 0) {
|
||||
// Character set range
|
||||
setRange = true;
|
||||
} else if (pCh == PAT_SET_CLOSE && setRange) {
|
||||
// Incomplete character set range
|
||||
error("Incomplete character set range", filePattern, i);
|
||||
} else if (pCh == PAT_SET_CLOSE && setOpen > 0) {
|
||||
// End of a character set
|
||||
if (setOpen < 2) {
|
||||
error("Unexpected end of set", filePattern, i);
|
||||
}
|
||||
setOpen = 0;
|
||||
} else if (setOpen > 0) {
|
||||
// Normal character, or the end of a character set range
|
||||
setOpen++;
|
||||
setRange = false;
|
||||
}
|
||||
fileRegex.append(pCh);
|
||||
}
|
||||
|
||||
// Check for a well-formed pattern
|
||||
if (setOpen > 0 || setRange || curlyOpen > 0) {
|
||||
// Incomplete character set or character range
|
||||
error("Expecting set closure character or end of range, or }",
|
||||
filePattern, len);
|
||||
}
|
||||
regex = Pattern.compile(fileRegex.toString());
|
||||
}
|
||||
|
||||
boolean hasPattern() {
|
||||
return hasPattern;
|
||||
}
|
||||
|
||||
public boolean accept(final Path path) {
|
||||
return regex.matcher(path.getName()).matches() && userFilter.accept(path);
|
||||
}
|
||||
|
||||
private void error(final String s, final String pattern, final int pos) {
|
||||
throw new HadoopIllegalArgumentException("Illegal file pattern: " + s
|
||||
+ " for glob " + pattern + " at " + pos);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if copying srcName to dst would overwrite an existing
|
||||
* file or directory.
|
||||
|
|
|
@ -1369,127 +1369,6 @@ public abstract class FileSystem extends Configured implements Closeable {
|
|||
return globPathsLevel(parents, filePattern, level + 1, hasGlob);
|
||||
}
|
||||
|
||||
/* A class that could decide if a string matches the glob or not */
|
||||
private static class GlobFilter implements PathFilter {
|
||||
private PathFilter userFilter = DEFAULT_FILTER;
|
||||
private Pattern regex;
|
||||
private boolean hasPattern = false;
|
||||
|
||||
/** Default pattern character: Escape any special meaning. */
|
||||
private static final char PAT_ESCAPE = '\\';
|
||||
/** Default pattern character: Any single character. */
|
||||
private static final char PAT_ANY = '.';
|
||||
/** Default pattern character: Character set close. */
|
||||
private static final char PAT_SET_CLOSE = ']';
|
||||
|
||||
GlobFilter(String filePattern) throws IOException {
|
||||
setRegex(filePattern);
|
||||
}
|
||||
|
||||
GlobFilter(String filePattern, PathFilter filter) throws IOException {
|
||||
userFilter = filter;
|
||||
setRegex(filePattern);
|
||||
}
|
||||
|
||||
private boolean isJavaRegexSpecialChar(char pChar) {
|
||||
return pChar == '.' || pChar == '$' || pChar == '(' || pChar == ')' ||
|
||||
pChar == '|' || pChar == '+';
|
||||
}
|
||||
void setRegex(String filePattern) throws IOException {
|
||||
int len;
|
||||
int setOpen;
|
||||
int curlyOpen;
|
||||
boolean setRange;
|
||||
|
||||
StringBuilder fileRegex = new StringBuilder();
|
||||
|
||||
// Validate the pattern
|
||||
len = filePattern.length();
|
||||
if (len == 0)
|
||||
return;
|
||||
|
||||
setOpen = 0;
|
||||
setRange = false;
|
||||
curlyOpen = 0;
|
||||
|
||||
for (int i = 0; i < len; i++) {
|
||||
char pCh;
|
||||
|
||||
// Examine a single pattern character
|
||||
pCh = filePattern.charAt(i);
|
||||
if (pCh == PAT_ESCAPE) {
|
||||
fileRegex.append(pCh);
|
||||
i++;
|
||||
if (i >= len)
|
||||
error("An escaped character does not present", filePattern, i);
|
||||
pCh = filePattern.charAt(i);
|
||||
} else if (isJavaRegexSpecialChar(pCh)) {
|
||||
fileRegex.append(PAT_ESCAPE);
|
||||
} else if (pCh == '*') {
|
||||
fileRegex.append(PAT_ANY);
|
||||
hasPattern = true;
|
||||
} else if (pCh == '?') {
|
||||
pCh = PAT_ANY;
|
||||
hasPattern = true;
|
||||
} else if (pCh == '{') {
|
||||
fileRegex.append('(');
|
||||
pCh = '(';
|
||||
curlyOpen++;
|
||||
hasPattern = true;
|
||||
} else if (pCh == ',' && curlyOpen > 0) {
|
||||
fileRegex.append(")|");
|
||||
pCh = '(';
|
||||
} else if (pCh == '}' && curlyOpen > 0) {
|
||||
// End of a group
|
||||
curlyOpen--;
|
||||
fileRegex.append(")");
|
||||
pCh = ')';
|
||||
} else if (pCh == '[' && setOpen == 0) {
|
||||
setOpen++;
|
||||
hasPattern = true;
|
||||
} else if (pCh == '^' && setOpen > 0) {
|
||||
} else if (pCh == '-' && setOpen > 0) {
|
||||
// Character set range
|
||||
setRange = true;
|
||||
} else if (pCh == PAT_SET_CLOSE && setRange) {
|
||||
// Incomplete character set range
|
||||
error("Incomplete character set range", filePattern, i);
|
||||
} else if (pCh == PAT_SET_CLOSE && setOpen > 0) {
|
||||
// End of a character set
|
||||
if (setOpen < 2)
|
||||
error("Unexpected end of set", filePattern, i);
|
||||
setOpen = 0;
|
||||
} else if (setOpen > 0) {
|
||||
// Normal character, or the end of a character set range
|
||||
setOpen++;
|
||||
setRange = false;
|
||||
}
|
||||
fileRegex.append(pCh);
|
||||
}
|
||||
|
||||
// Check for a well-formed pattern
|
||||
if (setOpen > 0 || setRange || curlyOpen > 0) {
|
||||
// Incomplete character set or character range
|
||||
error("Expecting set closure character or end of range, or }",
|
||||
filePattern, len);
|
||||
}
|
||||
regex = Pattern.compile(fileRegex.toString());
|
||||
}
|
||||
|
||||
boolean hasPattern() {
|
||||
return hasPattern;
|
||||
}
|
||||
|
||||
public boolean accept(Path path) {
|
||||
return regex.matcher(path.getName()).matches() && userFilter.accept(path);
|
||||
}
|
||||
|
||||
private void error(String s, String pattern, int pos) throws IOException {
|
||||
throw new IOException("Illegal file pattern: "
|
||||
+s+ " for glob "+ pattern + " at " + pos);
|
||||
}
|
||||
}
|
||||
|
||||
/** Return the current user's home directory in this filesystem.
|
||||
* The default implementation returns "/user/$USER/".
|
||||
*/
|
||||
|
|
|
@ -0,0 +1,61 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.fs;
|
||||
|
||||
import java.util.regex.PatternSyntaxException;
|
||||
import java.io.IOException;
|
||||
|
||||
// A class that could decide if a string matches the glob or not
|
||||
class GlobFilter implements PathFilter {
|
||||
private final static PathFilter DEFAULT_FILTER = new PathFilter() {
|
||||
public boolean accept(Path file) {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
private PathFilter userFilter = DEFAULT_FILTER;
|
||||
private GlobPattern pattern;
|
||||
|
||||
GlobFilter(String filePattern) throws IOException {
|
||||
init(filePattern, DEFAULT_FILTER);
|
||||
}
|
||||
|
||||
GlobFilter(String filePattern, PathFilter filter) throws IOException {
|
||||
init(filePattern, filter);
|
||||
}
|
||||
|
||||
void init(String filePattern, PathFilter filter) throws IOException {
|
||||
try {
|
||||
userFilter = filter;
|
||||
pattern = new GlobPattern(filePattern);
|
||||
}
|
||||
catch (PatternSyntaxException e) {
|
||||
// Existing code expects IOException startWith("Illegal file pattern")
|
||||
throw new IOException("Illegal file pattern: "+ e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
boolean hasPattern() {
|
||||
return pattern.hasWildcard();
|
||||
}
|
||||
|
||||
public boolean accept(Path path) {
|
||||
return pattern.matches(path.getName()) && userFilter.accept(path);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,169 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.fs;
|
||||
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.PatternSyntaxException;
|
||||
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.classification.InterfaceStability;
|
||||
|
||||
/**
|
||||
* A class for POSIX glob pattern with brace expansions.
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
@InterfaceStability.Evolving
|
||||
public class GlobPattern {
  private static final char BACKSLASH = '\\';
  /** Regular expression generated from the most recently set glob. */
  private Pattern compiled;
  /** True if the most recently set glob contained *, ?, { or [. */
  private boolean hasWildcard = false;

  /**
   * Construct the glob pattern object with a glob pattern string
   * @param globPattern the glob pattern string
   * @throws PatternSyntaxException if the glob is malformed
   */
  public GlobPattern(String globPattern) {
    set(globPattern);
  }

  /**
   * @return the compiled pattern
   */
  public Pattern compiled() {
    return compiled;
  }

  /**
   * Compile glob pattern string
   * @param globPattern the glob pattern
   * @return the pattern object
   * @throws PatternSyntaxException if the glob is malformed
   */
  public static Pattern compile(String globPattern) {
    return new GlobPattern(globPattern).compiled();
  }

  /**
   * Match input against the compiled glob pattern
   * @param s input chars
   * @return true if the whole input matches the pattern
   */
  public boolean matches(CharSequence s) {
    return compiled.matcher(s).matches();
  }

  /**
   * Set and compile a glob pattern.
   *
   * The glob is translated one character at a time into a regular
   * expression: '*' becomes ".*", '?' becomes ".", "{a,b}" becomes a
   * non-capturing alternation, "[!" becomes "[^", and regex meta characters
   * without a glob meaning are escaped.
   *
   * @param glob the glob pattern string
   * @throws PatternSyntaxException on a trailing backslash, an unclosed
   *         character class or group, or when the generated expression is
   *         rejected by the regex compiler
   */
  public void set(String glob) {
    StringBuilder regex = new StringBuilder();
    int setOpen = 0;
    int curlyOpen = 0;
    int len = glob.length();
    hasWildcard = false;

    for (int i = 0; i < len; i++) {
      char c = glob.charAt(i);

      switch (c) {
        case BACKSLASH:
          if (++i >= len) {
            error("Missing escaped character", glob, i);
          }
          char escaped = glob.charAt(i);
          // A backslash in front of a letter or digit is a regex construct
          // (\d, \b, \Q, \1, ...) or a syntax error, never a literal. Such
          // characters have no special meaning, so emit them unescaped.
          if (!Character.isLetterOrDigit(escaped)) {
            regex.append(BACKSLASH);
          }
          regex.append(escaped);
          continue;
        case '.':
        case '$':
        case '(':
        case ')':
        case '|':
        case '+':
          // escape regex special chars that are not glob special chars
          regex.append(BACKSLASH);
          break;
        case '*':
          // the '*' itself is appended after the switch, giving ".*"
          regex.append('.');
          hasWildcard = true;
          break;
        case '?':
          regex.append('.');
          hasWildcard = true;
          continue;
        case '{': // start of a group
          regex.append("(?:"); // non-capturing
          curlyOpen++;
          hasWildcard = true;
          continue;
        case ',':
          // a comma separates alternatives only inside a group
          regex.append(curlyOpen > 0 ? '|' : c);
          continue;
        case '}':
          if (curlyOpen > 0) {
            // end of a group
            curlyOpen--;
            regex.append(")");
            continue;
          }
          break;
        case '[':
          if (setOpen > 0) {
            error("Unclosed character class", glob, i);
          }
          setOpen++;
          hasWildcard = true;
          break;
        case '^': // ^ inside [...] can be unescaped
          if (setOpen == 0) {
            regex.append(BACKSLASH);
          }
          break;
        case '!': // [! needs to be translated to [^
          // setOpen > 0 implies a '[' was seen earlier, so i - 1 is valid
          regex.append(setOpen > 0 && '[' == glob.charAt(i - 1) ? '^' : '!');
          continue;
        case ']':
          // Many set errors like [][] could not be easily detected here,
          // as []], []-] and [-] are all valid POSIX glob and java regex.
          // We'll just let the regex compiler do the real work.
          setOpen = 0;
          break;
        default:
      }
      regex.append(c);
    }

    if (setOpen > 0) {
      error("Unclosed character class", glob, len);
    }
    if (curlyOpen > 0) {
      error("Unclosed group", glob, len);
    }
    // DOTALL lets the '.' generated for '*' and '?' match line terminators,
    // which may appear in names like any other character.
    compiled = Pattern.compile(regex.toString(), Pattern.DOTALL);
  }

  /**
   * @return true if this is a wildcard pattern (with special chars)
   */
  public boolean hasWildcard() {
    return hasWildcard;
  }

  /** Always throws; reports a malformed glob together with the position. */
  private static void error(String message, String pattern, int pos) {
    throw new PatternSyntaxException(message, pattern, pos);
  }
}
|
|
@ -0,0 +1,74 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hadoop.fs;
|
||||
|
||||
import java.util.regex.PatternSyntaxException;
|
||||
import org.junit.Test;
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
/**
|
||||
* Tests for glob patterns
|
||||
*/
|
||||
public class TestGlobPattern {
|
||||
private void assertMatch(boolean yes, String glob, String...input) {
|
||||
GlobPattern pattern = new GlobPattern(glob);
|
||||
|
||||
for (String s : input) {
|
||||
boolean result = pattern.matches(s);
|
||||
assertTrue(glob +" should"+ (yes ? "" : " not") +" match "+ s,
|
||||
yes ? result : !result);
|
||||
}
|
||||
}
|
||||
|
||||
private void shouldThrow(String... globs) {
|
||||
for (String glob : globs) {
|
||||
try {
|
||||
GlobPattern.compile(glob);
|
||||
}
|
||||
catch (PatternSyntaxException e) {
|
||||
e.printStackTrace();
|
||||
continue;
|
||||
}
|
||||
assertTrue("glob "+ glob +" should throw", false);
|
||||
}
|
||||
}
|
||||
|
||||
@Test public void testValidPatterns() {
|
||||
assertMatch(true, "*", "^$", "foo", "bar");
|
||||
assertMatch(true, "?", "?", "^", "[", "]", "$");
|
||||
assertMatch(true, "foo*", "foo", "food", "fool");
|
||||
assertMatch(true, "f*d", "fud", "food");
|
||||
assertMatch(true, "*d", "good", "bad");
|
||||
assertMatch(true, "\\*\\?\\[\\{\\\\", "*?[{\\");
|
||||
assertMatch(true, "[]^-]", "]", "-", "^");
|
||||
assertMatch(true, "]", "]");
|
||||
assertMatch(true, "^.$()|+", "^.$()|+");
|
||||
assertMatch(true, "[^^]", ".", "$", "[", "]");
|
||||
assertMatch(false, "[^^]", "^");
|
||||
assertMatch(true, "[!!-]", "^", "?");
|
||||
assertMatch(false, "[!!-]", "!", "-");
|
||||
assertMatch(true, "{[12]*,[45]*,[78]*}", "1", "2!", "4", "42", "7", "7$");
|
||||
assertMatch(false, "{[12]*,[45]*,[78]*}", "3", "6", "9ß");
|
||||
assertMatch(true, "}", "}");
|
||||
}
|
||||
|
||||
@Test public void testInvalidPatterns() {
|
||||
shouldThrow("[", "[[]]", "[][]", "{", "\\");
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue