HADOOP-6787. Factor out glob pattern code from FileContext and FileSystem. Contributed by Luke Lu.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@951207 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Eli Collins 2010-06-03 23:39:33 +00:00
parent 9e2c3bf9ed
commit c6bc36580a
6 changed files with 309 additions and 246 deletions

View File

@ -57,6 +57,11 @@ Trunk (unreleased changes)
HADOOP-6747. TestNetUtils fails on Mac OS X. (Todd Lipcon via jghoman) HADOOP-6747. TestNetUtils fails on Mac OS X. (Todd Lipcon via jghoman)
HADOOP-6787. Factor out glob pattern code from FileContext and
FileSystem. Also fix bugs identified in HADOOP-6618 and make the
glob pattern code less restrictive and more POSIX standard
compliant. (Luke Lu via eli)
Release 0.21.0 - Unreleased Release 0.21.0 - Unreleased
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES

View File

@ -1874,131 +1874,6 @@ public boolean copy(final Path src, final Path dst, boolean deleteSource,
} }
} }
/*
 * A PathFilter that accepts a path when path.getName() matches a glob
 * pattern and the user supplied filter accepts it as well. The glob is
 * translated character by character into a java.util.regex pattern by
 * setRegex().
 */
private static class GlobFilter implements PathFilter {
// Additional filter consulted by accept(); DEFAULT_FILTER is declared
// outside this class (presumably an accept-all filter -- confirm there).
private PathFilter userFilter = DEFAULT_FILTER;
// Compiled translation of the glob; assigned only at the end of setRegex().
private Pattern regex;
// Set to true once a '*', '?', '{' or set-opening '[' has been seen.
private boolean hasPattern = false;
/** Default pattern character: Escape any special meaning. */
private static final char PAT_ESCAPE = '\\';
/** Default pattern character: Any single character. */
private static final char PAT_ANY = '.';
/** Default pattern character: Character set close. */
private static final char PAT_SET_CLOSE = ']';
/** Builds a filter for the glob, keeping the default user filter. */
GlobFilter(final String filePattern) {
setRegex(filePattern);
}
/** Builds a filter for the glob combined with the given user filter. */
GlobFilter(final String filePattern, final PathFilter filter) {
userFilter = filter;
setRegex(filePattern);
}
/**
 * Returns true for the characters that setRegex() prefixes with a
 * backslash before copying them into the regex: . $ ( ) | +
 */
private boolean isJavaRegexSpecialChar(char pChar) {
return pChar == '.' || pChar == '$' || pChar == '(' || pChar == ')' ||
pChar == '|' || pChar == '+';
}
/**
 * Translates the glob into a regex and compiles it into {@code regex}.
 * Each branch below may append a prefix and/or replace pCh; the (possibly
 * replaced) pCh is then appended once at the bottom of the loop.
 * Malformed globs are reported through error(), which throws.
 */
void setRegex(String filePattern) {
int len;
// 0 when outside [...]; otherwise 1 + number of ordinary members seen.
int setOpen;
// Number of '{' groups currently open.
int curlyOpen;
// True after a '-' inside a set until the next ordinary member is seen.
boolean setRange;
StringBuilder fileRegex = new StringBuilder();
// Validate the pattern
len = filePattern.length();
if (len == 0) {
// Empty glob: return without compiling, so regex keeps its prior value.
return;
}
setOpen = 0;
setRange = false;
curlyOpen = 0;
for (int i = 0; i < len; i++) {
char pCh;
// Examine a single pattern character
pCh = filePattern.charAt(i);
if (pCh == PAT_ESCAPE) {
// Copy the backslash now; the escaped character is appended below.
fileRegex.append(pCh);
i++;
if (i >= len) {
error("An escaped character does not present", filePattern, i);
}
pCh = filePattern.charAt(i);
} else if (isJavaRegexSpecialChar(pCh)) {
// Regex metacharacter without glob meaning: emit it backslash-escaped.
fileRegex.append(PAT_ESCAPE);
} else if (pCh == '*') {
// '.' here plus the '*' appended below yields ".*".
fileRegex.append(PAT_ANY);
hasPattern = true;
} else if (pCh == '?') {
// '?' is emitted as a single '.'.
pCh = PAT_ANY;
hasPattern = true;
} else if (pCh == '{') {
// Emits "((": one '(' here and one through pCh below.
fileRegex.append('(');
pCh = '(';
curlyOpen++;
hasPattern = true;
} else if (pCh == ',' && curlyOpen > 0) {
// Alternative separator inside a group: emits ")|(".
fileRegex.append(")|");
pCh = '(';
} else if (pCh == '}' && curlyOpen > 0) {
// End of a group
// Emits "))", closing both parentheses opened for the group.
curlyOpen--;
fileRegex.append(")");
pCh = ')';
} else if (pCh == '[' && setOpen == 0) {
// Start of a character set; '[' itself is copied below.
setOpen++;
hasPattern = true;
} else if (pCh == '^' && setOpen > 0) {
// '^' inside a set is copied unchanged and is not counted as a member.
} else if (pCh == '-' && setOpen > 0) {
// Character set range
setRange = true;
} else if (pCh == PAT_SET_CLOSE && setRange) {
// Incomplete character set range
error("Incomplete character set range", filePattern, i);
} else if (pCh == PAT_SET_CLOSE && setOpen > 0) {
// End of a character set
if (setOpen < 2) {
// No ordinary member was counted since the opening '['.
error("Unexpected end of set", filePattern, i);
}
setOpen = 0;
} else if (setOpen > 0) {
// Normal character, or the end of a character set range
setOpen++;
setRange = false;
}
fileRegex.append(pCh);
}
// Check for a well-formed pattern
if (setOpen > 0 || setRange || curlyOpen > 0) {
// Incomplete character set or character range
error("Expecting set closure character or end of range, or }",
filePattern, len);
}
regex = Pattern.compile(fileRegex.toString());
}
/** Returns whether the glob contained any wildcard construct. */
boolean hasPattern() {
return hasPattern;
}
/**
 * Accepts the path when the whole of path.getName() matches the compiled
 * regex and the user filter accepts the path; the user filter is only
 * consulted after a successful match.
 */
public boolean accept(final Path path) {
return regex.matcher(path.getName()).matches() && userFilter.accept(path);
}
/**
 * Always throws HadoopIllegalArgumentException with a message built from
 * the description, the offending glob and the position.
 */
private void error(final String s, final String pattern, final int pos) {
throw new HadoopIllegalArgumentException("Illegal file pattern: " + s
+ " for glob " + pattern + " at " + pos);
}
}
/** /**
* Check if copying srcName to dst would overwrite an existing * Check if copying srcName to dst would overwrite an existing
* file or directory. * file or directory.

View File

@ -1369,127 +1369,6 @@ private Path[] globPathsLevel(Path[] parents, String[] filePattern,
return globPathsLevel(parents, filePattern, level + 1, hasGlob); return globPathsLevel(parents, filePattern, level + 1, hasGlob);
} }
/*
 * A PathFilter that accepts a path when path.getName() matches a glob
 * pattern and the user supplied filter accepts it as well. The glob is
 * translated character by character into a java.util.regex pattern by
 * setRegex().
 */
private static class GlobFilter implements PathFilter {
// Additional filter consulted by accept(); DEFAULT_FILTER is declared
// outside this class (presumably an accept-all filter -- confirm there).
private PathFilter userFilter = DEFAULT_FILTER;
// Compiled translation of the glob; assigned only at the end of setRegex().
private Pattern regex;
// Set to true once a '*', '?', '{' or set-opening '[' has been seen.
private boolean hasPattern = false;
/** Default pattern character: Escape any special meaning. */
private static final char PAT_ESCAPE = '\\';
/** Default pattern character: Any single character. */
private static final char PAT_ANY = '.';
/** Default pattern character: Character set close. */
private static final char PAT_SET_CLOSE = ']';
/**
 * Builds a filter for the glob, keeping the default user filter.
 * @throws IOException if the glob is malformed
 */
GlobFilter(String filePattern) throws IOException {
setRegex(filePattern);
}
/**
 * Builds a filter for the glob combined with the given user filter.
 * @throws IOException if the glob is malformed
 */
GlobFilter(String filePattern, PathFilter filter) throws IOException {
userFilter = filter;
setRegex(filePattern);
}
/**
 * Returns true for the characters that setRegex() prefixes with a
 * backslash before copying them into the regex: . $ ( ) | +
 */
private boolean isJavaRegexSpecialChar(char pChar) {
return pChar == '.' || pChar == '$' || pChar == '(' || pChar == ')' ||
pChar == '|' || pChar == '+';
}
/**
 * Translates the glob into a regex and compiles it into {@code regex}.
 * Each branch below may append a prefix and/or replace pCh; the (possibly
 * replaced) pCh is then appended once at the bottom of the loop.
 * @throws IOException (raised by error()) if the glob is malformed
 */
void setRegex(String filePattern) throws IOException {
int len;
// 0 when outside [...]; otherwise 1 + number of ordinary members seen.
int setOpen;
// Number of '{' groups currently open.
int curlyOpen;
// True after a '-' inside a set until the next ordinary member is seen.
boolean setRange;
StringBuilder fileRegex = new StringBuilder();
// Validate the pattern
len = filePattern.length();
// Empty glob: return without compiling, so regex keeps its prior value.
if (len == 0)
return;
setOpen = 0;
setRange = false;
curlyOpen = 0;
for (int i = 0; i < len; i++) {
char pCh;
// Examine a single pattern character
pCh = filePattern.charAt(i);
if (pCh == PAT_ESCAPE) {
// Copy the backslash now; the escaped character is appended below.
fileRegex.append(pCh);
i++;
if (i >= len)
error("An escaped character does not present", filePattern, i);
pCh = filePattern.charAt(i);
} else if (isJavaRegexSpecialChar(pCh)) {
// Regex metacharacter without glob meaning: emit it backslash-escaped.
fileRegex.append(PAT_ESCAPE);
} else if (pCh == '*') {
// '.' here plus the '*' appended below yields ".*".
fileRegex.append(PAT_ANY);
hasPattern = true;
} else if (pCh == '?') {
// '?' is emitted as a single '.'.
pCh = PAT_ANY;
hasPattern = true;
} else if (pCh == '{') {
// Emits "((": one '(' here and one through pCh below.
fileRegex.append('(');
pCh = '(';
curlyOpen++;
hasPattern = true;
} else if (pCh == ',' && curlyOpen > 0) {
// Alternative separator inside a group: emits ")|(".
fileRegex.append(")|");
pCh = '(';
} else if (pCh == '}' && curlyOpen > 0) {
// End of a group
// Emits "))", closing both parentheses opened for the group.
curlyOpen--;
fileRegex.append(")");
pCh = ')';
} else if (pCh == '[' && setOpen == 0) {
// Start of a character set; '[' itself is copied below.
setOpen++;
hasPattern = true;
} else if (pCh == '^' && setOpen > 0) {
// '^' inside a set is copied unchanged and is not counted as a member.
} else if (pCh == '-' && setOpen > 0) {
// Character set range
setRange = true;
} else if (pCh == PAT_SET_CLOSE && setRange) {
// Incomplete character set range
error("Incomplete character set range", filePattern, i);
} else if (pCh == PAT_SET_CLOSE && setOpen > 0) {
// End of a character set
// setOpen < 2 means no ordinary member was counted since the '['.
if (setOpen < 2)
error("Unexpected end of set", filePattern, i);
setOpen = 0;
} else if (setOpen > 0) {
// Normal character, or the end of a character set range
setOpen++;
setRange = false;
}
fileRegex.append(pCh);
}
// Check for a well-formed pattern
if (setOpen > 0 || setRange || curlyOpen > 0) {
// Incomplete character set or character range
error("Expecting set closure character or end of range, or }",
filePattern, len);
}
regex = Pattern.compile(fileRegex.toString());
}
/** Returns whether the glob contained any wildcard construct. */
boolean hasPattern() {
return hasPattern;
}
/**
 * Accepts the path when the whole of path.getName() matches the compiled
 * regex and the user filter accepts the path; the user filter is only
 * consulted after a successful match.
 */
public boolean accept(Path path) {
return regex.matcher(path.getName()).matches() && userFilter.accept(path);
}
/**
 * Always throws an IOException whose message is built from the
 * description, the offending glob and the position.
 */
private void error(String s, String pattern, int pos) throws IOException {
throw new IOException("Illegal file pattern: "
+s+ " for glob "+ pattern + " at " + pos);
}
}
/** Return the current user's home directory in this filesystem. /** Return the current user's home directory in this filesystem.
* The default implementation returns "/user/$USER/". * The default implementation returns "/user/$USER/".
*/ */

View File

@ -0,0 +1,61 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.fs;
import java.util.regex.PatternSyntaxException;
import java.io.IOException;
/**
 * A {@link PathFilter} that accepts a path when the string returned by
 * {@code path.getName()} matches a glob pattern and a second, caller
 * supplied filter accepts the path too.
 */
class GlobFilter implements PathFilter {
  /** Filter used when the caller supplies none; it accepts every path. */
  private final static PathFilter DEFAULT_FILTER = new PathFilter() {
    public boolean accept(Path file) {
      return true;
    }
  };

  /** Filter consulted after the glob has matched. */
  private PathFilter userFilter = DEFAULT_FILTER;
  /** Compiled form of the glob handed to {@link #init}. */
  private GlobPattern pattern;

  /**
   * Creates a filter that only checks the glob.
   * @param filePattern the glob pattern string
   * @throws IOException if the glob is malformed
   */
  GlobFilter(String filePattern) throws IOException {
    this(filePattern, DEFAULT_FILTER);
  }

  /**
   * Creates a filter that checks the glob and then the given filter.
   * @param filePattern the glob pattern string
   * @param filter filter applied to paths whose name matched the glob
   * @throws IOException if the glob is malformed
   */
  GlobFilter(String filePattern, PathFilter filter) throws IOException {
    init(filePattern, filter);
  }

  /**
   * Stores the user filter and compiles the glob.
   * @param filePattern the glob pattern string
   * @param filter filter applied to paths whose name matched the glob
   * @throws IOException wrapping the {@link PatternSyntaxException} raised
   *         for a malformed glob
   */
  void init(String filePattern, PathFilter filter) throws IOException {
    userFilter = filter;
    GlobPattern globPattern;
    try {
      globPattern = new GlobPattern(filePattern);
    } catch (PatternSyntaxException syntaxError) {
      // Existing code expects IOException startWith("Illegal file pattern")
      throw new IOException("Illegal file pattern: "+ syntaxError.getMessage(),
                            syntaxError);
    }
    pattern = globPattern;
  }

  /** @return true if the glob contains wildcard constructs */
  boolean hasPattern() {
    return pattern.hasWildcard();
  }

  /**
   * @param path the path to test
   * @return true when the name matches the glob and the user filter
   *         accepts the path; the user filter is skipped on a mismatch
   */
  public boolean accept(Path path) {
    if (!pattern.matches(path.getName())) {
      return false;
    }
    return userFilter.accept(path);
  }
}

View File

@ -0,0 +1,169 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.fs;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
/**
* A class for POSIX glob pattern with brace expansions.
*/
@InterfaceAudience.Private
@InterfaceStability.Evolving
public class GlobPattern {
  private static final char BACKSLASH = '\\';
  /** Regex compiled from the most recent successful {@link #set}. */
  private Pattern compiled;
  /** Whether the most recently accepted glob had wildcard constructs. */
  private boolean hasWildcard = false;

  /**
   * Construct the glob pattern object with a glob pattern string
   * @param globPattern the glob pattern string
   * @throws PatternSyntaxException if the glob is malformed
   */
  public GlobPattern(String globPattern) {
    set(globPattern);
  }

  /**
   * @return the compiled pattern
   */
  public Pattern compiled() {
    return compiled;
  }

  /**
   * Compile glob pattern string
   * @param globPattern the glob pattern
   * @return the pattern object
   * @throws PatternSyntaxException if the glob is malformed
   */
  public static Pattern compile(String globPattern) {
    return new GlobPattern(globPattern).compiled();
  }

  /**
   * Match input against the compiled glob pattern
   * @param s input chars
   * @return true if the whole input matches the glob
   */
  public boolean matches(CharSequence s) {
    return compiled.matcher(s).matches();
  }

  /**
   * Set and compile a glob pattern. The glob is translated one character
   * at a time into a Java regex:
   * <ul>
   * <li>a backslash makes the following character literal;</li>
   * <li>{@code *} and {@code ?} match any run of characters and any single
   *     character respectively;</li>
   * <li>{@code {a,b}} is an alternation;</li>
   * <li>{@code [...]} is a character set, negated by a leading {@code !}
   *     or {@code ^}. Inside a set the characters {@code * ? { , }} stand
   *     for themselves.</li>
   * </ul>
   * If the glob is rejected, the previously compiled pattern and wildcard
   * flag are left untouched.
   *
   * @param glob the glob pattern string
   * @throws PatternSyntaxException if the glob is malformed
   */
  public void set(String glob) {
    StringBuilder regex = new StringBuilder();
    int setOpen = 0;      // > 0 while inside [...]
    int setStart = -1;    // index of the '[' that opened the current set
    int curlyOpen = 0;    // number of {...} groups still open
    int len = glob.length();
    // Collected locally so that a failed call cannot leave this object
    // with a wildcard flag that disagrees with the compiled pattern.
    boolean wildcard = false;

    for (int i = 0; i < len; i++) {
      char c = glob.charAt(i);

      switch (c) {
        case BACKSLASH:
          if (++i >= len) {
            error("Missing escaped character", glob, i);
          }
          appendEscaped(regex, glob.charAt(i));
          continue;
        case '.':
        case '$':
        case '(':
        case ')':
        case '|':
        case '+':
          // escape regex special chars that are not glob special chars
          regex.append(BACKSLASH);
          break;
        case '*':
          if (setOpen == 0) {
            // the '*' appended after the switch completes ".*"
            regex.append('.');
            wildcard = true;
          }
          break; // inside a set '*' is an ordinary member
        case '?':
          if (setOpen > 0) {
            break; // inside a set '?' is an ordinary member
          }
          regex.append('.');
          wildcard = true;
          continue;
        case '{': // start of a group
          if (setOpen > 0) {
            break; // inside a set '{' is an ordinary member
          }
          regex.append("(?:"); // non-capturing
          curlyOpen++;
          wildcard = true;
          continue;
        case ',':
          // only a separator directly inside a group, never inside a set
          regex.append(curlyOpen > 0 && setOpen == 0 ? '|' : c);
          continue;
        case '}':
          if (curlyOpen > 0 && setOpen == 0) {
            // end of a group
            curlyOpen--;
            regex.append(")");
            continue;
          }
          break;
        case '[':
          if (setOpen > 0) {
            error("Unclosed character class", glob, i);
          }
          setOpen++;
          setStart = i;
          wildcard = true;
          break;
        case '^': // ^ inside [...] can be unescaped
          if (setOpen == 0) {
            regex.append(BACKSLASH);
          }
          break;
        case '!': // [! needs to be translated to [^
          // Compare positions rather than the previous raw character so
          // that an escaped '[' inside a set is not mistaken for the
          // opening bracket.
          regex.append(setOpen > 0 && i - 1 == setStart ? '^' : '!');
          continue;
        case ']':
          // Many set errors like [][] could not be easily detected here,
          // as []], []-] and [-] are all valid POSIX glob and java regex.
          // We'll just let the regex compiler do the real work.
          setOpen = 0;
          break;
        default:
      }
      regex.append(c);
    }

    if (setOpen > 0) {
      error("Unclosed character class", glob, len);
    }
    if (curlyOpen > 0) {
      error("Unclosed group", glob, len);
    }
    Pattern pattern = Pattern.compile(regex.toString());
    compiled = pattern;
    hasWildcard = wildcard;
  }

  /**
   * @return true if this is a wildcard pattern (with special chars)
   */
  public boolean hasWildcard() {
    return hasWildcard;
  }

  /**
   * Appends a character that was backslash-escaped in the glob so that the
   * regex matches it literally. ASCII letters and digits are emitted bare,
   * because a backslash in front of them selects a regex construct (a
   * predefined class, a back reference, ...) instead of the character.
   */
  private static void appendEscaped(StringBuilder regex, char c) {
    boolean asciiAlnum = (c >= '0' && c <= '9')
        || (c >= 'A' && c <= 'Z')
        || (c >= 'a' && c <= 'z');
    if (!asciiAlnum) {
      regex.append(BACKSLASH);
    }
    regex.append(c);
  }

  /** Raises the syntax error for the given glob and position. */
  private static void error(String message, String pattern, int pos) {
    throw new PatternSyntaxException(message, pattern, pos);
  }
}

View File

@ -0,0 +1,74 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.fs;
import java.util.regex.PatternSyntaxException;
import org.junit.Test;
import static org.junit.Assert.*;
/**
 * Tests for glob patterns
 */
public class TestGlobPattern {
  /**
   * Asserts that every input does (or does not) match the glob.
   * @param yes true if the inputs are expected to match
   * @param glob the glob pattern under test
   * @param input the strings matched against the glob
   */
  private void assertMatch(boolean yes, String glob, String...input) {
    GlobPattern pattern = new GlobPattern(glob);

    for (String s : input) {
      boolean result = pattern.matches(s);
      assertTrue(glob +" should"+ (yes ? "" : " not") +" match "+ s,
                 result == yes);
    }
  }

  /**
   * Asserts that compiling each glob raises PatternSyntaxException.
   * @param globs the malformed glob patterns
   */
  private void shouldThrow(String... globs) {
    for (String glob : globs) {
      try {
        GlobPattern.compile(glob);
      } catch (PatternSyntaxException expected) {
        // This is the outcome under test; nothing to report.
        continue;
      }
      fail("glob "+ glob +" should throw");
    }
  }

  @Test public void testValidPatterns() {
    assertMatch(true, "*", "^$", "foo", "bar");
    assertMatch(true, "?", "?", "^", "[", "]", "$");
    assertMatch(true, "foo*", "foo", "food", "fool");
    assertMatch(true, "f*d", "fud", "food");
    assertMatch(true, "*d", "good", "bad");
    assertMatch(true, "\\*\\?\\[\\{\\\\", "*?[{\\");
    assertMatch(true, "[]^-]", "]", "-", "^");
    assertMatch(true, "]", "]");
    assertMatch(true, "^.$()|+", "^.$()|+");
    assertMatch(true, "[^^]", ".", "$", "[", "]");
    assertMatch(false, "[^^]", "^");
    assertMatch(true, "[!!-]", "^", "?");
    assertMatch(false, "[!!-]", "!", "-");
    assertMatch(true, "{[12]*,[45]*,[78]*}", "1", "2!", "4", "42", "7", "7$");
    assertMatch(false, "{[12]*,[45]*,[78]*}", "3", "6", "");
    assertMatch(true, "}", "}");
  }

  @Test public void testInvalidPatterns() {
    shouldThrow("[", "[[]]", "[][]", "{", "\\");
  }
}