diff --git a/CHANGES.txt b/CHANGES.txt index 27d9d97f133..0aca265fc7f 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -57,6 +57,11 @@ Trunk (unreleased changes) HADOOP-6747. TestNetUtils fails on Mac OS X. (Todd Lipcon via jghoman) + HADOOP-6787. Factor out glob pattern code from FileContext and + Filesystem. Also fix bugs identified in HADOOP-6618 and make the + glob pattern code less restrictive and more POSIX standard + compliant. (Luke Lu via eli) + Release 0.21.0 - Unreleased INCOMPATIBLE CHANGES diff --git a/src/java/org/apache/hadoop/fs/FileContext.java b/src/java/org/apache/hadoop/fs/FileContext.java index 166dfa24bc7..4f921a3b448 100644 --- a/src/java/org/apache/hadoop/fs/FileContext.java +++ b/src/java/org/apache/hadoop/fs/FileContext.java @@ -1874,131 +1874,6 @@ public final class FileContext { } } - /* A class that could decide if a string matches the glob or not */ - private static class GlobFilter implements PathFilter { - private PathFilter userFilter = DEFAULT_FILTER; - private Pattern regex; - private boolean hasPattern = false; - - /** Default pattern character: Escape any special meaning. */ - private static final char PAT_ESCAPE = '\\'; - /** Default pattern character: Any single character. */ - private static final char PAT_ANY = '.'; - /** Default pattern character: Character set close. */ - private static final char PAT_SET_CLOSE = ']'; - - GlobFilter(final String filePattern) { - setRegex(filePattern); - } - - GlobFilter(final String filePattern, final PathFilter filter) { - userFilter = filter; - setRegex(filePattern); - } - - private boolean isJavaRegexSpecialChar(char pChar) { - return pChar == '.' 
|| pChar == '$' || pChar == '(' || pChar == ')' || - pChar == '|' || pChar == '+'; - } - - void setRegex(String filePattern) { - int len; - int setOpen; - int curlyOpen; - boolean setRange; - - StringBuilder fileRegex = new StringBuilder(); - - // Validate the pattern - len = filePattern.length(); - if (len == 0) { - return; - } - - setOpen = 0; - setRange = false; - curlyOpen = 0; - - for (int i = 0; i < len; i++) { - char pCh; - - // Examine a single pattern character - pCh = filePattern.charAt(i); - if (pCh == PAT_ESCAPE) { - fileRegex.append(pCh); - i++; - if (i >= len) { - error("An escaped character does not present", filePattern, i); - } - pCh = filePattern.charAt(i); - } else if (isJavaRegexSpecialChar(pCh)) { - fileRegex.append(PAT_ESCAPE); - } else if (pCh == '*') { - fileRegex.append(PAT_ANY); - hasPattern = true; - } else if (pCh == '?') { - pCh = PAT_ANY; - hasPattern = true; - } else if (pCh == '{') { - fileRegex.append('('); - pCh = '('; - curlyOpen++; - hasPattern = true; - } else if (pCh == ',' && curlyOpen > 0) { - fileRegex.append(")|"); - pCh = '('; - } else if (pCh == '}' && curlyOpen > 0) { - // End of a group - curlyOpen--; - fileRegex.append(")"); - pCh = ')'; - } else if (pCh == '[' && setOpen == 0) { - setOpen++; - hasPattern = true; - } else if (pCh == '^' && setOpen > 0) { - } else if (pCh == '-' && setOpen > 0) { - // Character set range - setRange = true; - } else if (pCh == PAT_SET_CLOSE && setRange) { - // Incomplete character set range - error("Incomplete character set range", filePattern, i); - } else if (pCh == PAT_SET_CLOSE && setOpen > 0) { - // End of a character set - if (setOpen < 2) { - error("Unexpected end of set", filePattern, i); - } - setOpen = 0; - } else if (setOpen > 0) { - // Normal character, or the end of a character set range - setOpen++; - setRange = false; - } - fileRegex.append(pCh); - } - - // Check for a well-formed pattern - if (setOpen > 0 || setRange || curlyOpen > 0) { - // Incomplete character set or 
character range - error("Expecting set closure character or end of range, or }", - filePattern, len); - } - regex = Pattern.compile(fileRegex.toString()); - } - - boolean hasPattern() { - return hasPattern; - } - - public boolean accept(final Path path) { - return regex.matcher(path.getName()).matches() && userFilter.accept(path); - } - - private void error(final String s, final String pattern, final int pos) { - throw new HadoopIllegalArgumentException("Illegal file pattern: " + s - + " for glob " + pattern + " at " + pos); - } - } - /** * Check if copying srcName to dst would overwrite an existing * file or directory. diff --git a/src/java/org/apache/hadoop/fs/FileSystem.java b/src/java/org/apache/hadoop/fs/FileSystem.java index ccaf0dfd629..63bdca14644 100644 --- a/src/java/org/apache/hadoop/fs/FileSystem.java +++ b/src/java/org/apache/hadoop/fs/FileSystem.java @@ -1369,127 +1369,6 @@ public abstract class FileSystem extends Configured implements Closeable { return globPathsLevel(parents, filePattern, level + 1, hasGlob); } - /* A class that could decide if a string matches the glob or not */ - private static class GlobFilter implements PathFilter { - private PathFilter userFilter = DEFAULT_FILTER; - private Pattern regex; - private boolean hasPattern = false; - - /** Default pattern character: Escape any special meaning. */ - private static final char PAT_ESCAPE = '\\'; - /** Default pattern character: Any single character. */ - private static final char PAT_ANY = '.'; - /** Default pattern character: Character set close. */ - private static final char PAT_SET_CLOSE = ']'; - - GlobFilter(String filePattern) throws IOException { - setRegex(filePattern); - } - - GlobFilter(String filePattern, PathFilter filter) throws IOException { - userFilter = filter; - setRegex(filePattern); - } - - private boolean isJavaRegexSpecialChar(char pChar) { - return pChar == '.' 
|| pChar == '$' || pChar == '(' || pChar == ')' || - pChar == '|' || pChar == '+'; - } - void setRegex(String filePattern) throws IOException { - int len; - int setOpen; - int curlyOpen; - boolean setRange; - - StringBuilder fileRegex = new StringBuilder(); - - // Validate the pattern - len = filePattern.length(); - if (len == 0) - return; - - setOpen = 0; - setRange = false; - curlyOpen = 0; - - for (int i = 0; i < len; i++) { - char pCh; - - // Examine a single pattern character - pCh = filePattern.charAt(i); - if (pCh == PAT_ESCAPE) { - fileRegex.append(pCh); - i++; - if (i >= len) - error("An escaped character does not present", filePattern, i); - pCh = filePattern.charAt(i); - } else if (isJavaRegexSpecialChar(pCh)) { - fileRegex.append(PAT_ESCAPE); - } else if (pCh == '*') { - fileRegex.append(PAT_ANY); - hasPattern = true; - } else if (pCh == '?') { - pCh = PAT_ANY; - hasPattern = true; - } else if (pCh == '{') { - fileRegex.append('('); - pCh = '('; - curlyOpen++; - hasPattern = true; - } else if (pCh == ',' && curlyOpen > 0) { - fileRegex.append(")|"); - pCh = '('; - } else if (pCh == '}' && curlyOpen > 0) { - // End of a group - curlyOpen--; - fileRegex.append(")"); - pCh = ')'; - } else if (pCh == '[' && setOpen == 0) { - setOpen++; - hasPattern = true; - } else if (pCh == '^' && setOpen > 0) { - } else if (pCh == '-' && setOpen > 0) { - // Character set range - setRange = true; - } else if (pCh == PAT_SET_CLOSE && setRange) { - // Incomplete character set range - error("Incomplete character set range", filePattern, i); - } else if (pCh == PAT_SET_CLOSE && setOpen > 0) { - // End of a character set - if (setOpen < 2) - error("Unexpected end of set", filePattern, i); - setOpen = 0; - } else if (setOpen > 0) { - // Normal character, or the end of a character set range - setOpen++; - setRange = false; - } - fileRegex.append(pCh); - } - - // Check for a well-formed pattern - if (setOpen > 0 || setRange || curlyOpen > 0) { - // Incomplete character set or 
character range - error("Expecting set closure character or end of range, or }", - filePattern, len); - } - regex = Pattern.compile(fileRegex.toString()); - } - - boolean hasPattern() { - return hasPattern; - } - - public boolean accept(Path path) { - return regex.matcher(path.getName()).matches() && userFilter.accept(path); - } - - private void error(String s, String pattern, int pos) throws IOException { - throw new IOException("Illegal file pattern: " - +s+ " for glob "+ pattern + " at " + pos); - } - } - /** Return the current user's home directory in this filesystem. * The default implementation returns "/user/$USER/". */ diff --git a/src/java/org/apache/hadoop/fs/GlobFilter.java b/src/java/org/apache/hadoop/fs/GlobFilter.java new file mode 100644 index 00000000000..b5c04f004f2 --- /dev/null +++ b/src/java/org/apache/hadoop/fs/GlobFilter.java @@ -0,0 +1,61 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.hadoop.fs;
+
+import java.util.regex.PatternSyntaxException;
+import java.io.IOException;
+
+/** Accepts paths whose name matches a glob and that pass a user filter. */
+class GlobFilter implements PathFilter {
+  private final static PathFilter DEFAULT_FILTER = new PathFilter() {
+    public boolean accept(Path file) { return true; }
+  };
+
+  private PathFilter userFilter = DEFAULT_FILTER;
+  private GlobPattern pattern;
+
+  GlobFilter(String filePattern) throws IOException {
+    init(filePattern, DEFAULT_FILTER);
+  }
+
+  GlobFilter(String filePattern, PathFilter filter) throws IOException {
+    init(filePattern, filter);
+  }
+
+  /** Compiles the glob first; a null filter means no extra filtering. */
+  void init(String filePattern, PathFilter filter) throws IOException {
+    try {
+      pattern = new GlobPattern(filePattern);
+    } catch (PatternSyntaxException e) {
+      // Existing code expects IOException startWith("Illegal file pattern")
+      throw new IOException("Illegal file pattern: "+ e.getMessage(), e);
+    }
+    userFilter = (filter == null) ? DEFAULT_FILTER : filter;
+  }
+
+  /** @return true if the glob contains wildcard characters */
+  boolean hasPattern() {
+    return pattern.hasWildcard();
+  }
+
+  /** @return true if the path name matches and the user filter accepts */
+  public boolean accept(Path path) {
+    return pattern.matches(path.getName()) && userFilter.accept(path);
+  }
+}
diff --git a/src/java/org/apache/hadoop/fs/GlobPattern.java b/src/java/org/apache/hadoop/fs/GlobPattern.java
new file mode 100644
index 00000000000..4be5b1cfb4d
--- /dev/null
+++ b/src/java/org/apache/hadoop/fs/GlobPattern.java
@@ -0,0 +1,169 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs;
+
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+/**
+ * A class for POSIX glob pattern with brace expansions.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Evolving
+public class GlobPattern {
+  private static final char BACKSLASH = '\\';
+  private Pattern compiled;
+  private boolean hasWildcard = false;
+
+  /**
+   * Construct the glob pattern object with a glob pattern string
+   * @param globPattern the glob pattern string
+   */
+  public GlobPattern(String globPattern) {
+    set(globPattern);
+  }
+
+  /**
+   * @return the compiled pattern
+   */
+  public Pattern compiled() {
+    return compiled;
+  }
+
+  /**
+   * Compile glob pattern string
+   * @param globPattern the glob pattern
+   * @return the pattern object
+   */
+  public static Pattern compile(String globPattern) {
+    return new GlobPattern(globPattern).compiled();
+  }
+
+  /**
+   * Match input against the compiled glob pattern
+   * @param s input chars
+   * @return true for successful matches
+   */
+  public boolean matches(CharSequence s) {
+    return compiled.matcher(s).matches();
+  }
+
+  /**
+   * Set and compile a glob pattern. Inside a bracket expression the glob
+   * special chars are literal, and wildcards also match line terminators.
+   * @param glob the glob pattern string
+   * @throws PatternSyntaxException if malformed; the old state is then kept
+   */
+  public void set(String glob) {
+    StringBuilder regex = new StringBuilder();
+    int setOpen = 0;
+    int curlyOpen = 0;
+    int len = glob.length();
+    boolean wildcard = false;
+
+    for (int i = 0; i < len; i++) {
+      char c = glob.charAt(i);
+      if (setOpen > 0 && "*?{},".indexOf(c) >= 0) {
+        regex.append(c); // no special meaning inside [...]
+        continue;
+      }
+      switch (c) {
+      case BACKSLASH:
+        if (++i >= len) {
+          error("Missing escaped character", glob, i);
+        }
+        regex.append(c).append(glob.charAt(i));
+        continue;
+      case '.':
+      case '$':
+      case '(':
+      case ')':
+      case '|':
+      case '+':
+        // escape regex special chars that are not glob special chars
+        regex.append(BACKSLASH);
+        break;
+      case '*':
+      case '?':
+        regex.append('*' == c ? ".*" : ".");
+        wildcard = true;
+        continue;
+      case '{': // start of a group
+        regex.append("(?:"); // non-capturing
+        curlyOpen++;
+        wildcard = true;
+        continue;
+      case ',':
+        regex.append(curlyOpen > 0 ? '|' : c);
+        continue;
+      case '}': // end of a group, or a literal when no group is open
+        if (curlyOpen > 0) {
+          curlyOpen--;
+          regex.append(")");
+          continue;
+        }
+        break;
+      case '[':
+        if (setOpen > 0) {
+          error("Unclosed character class", glob, i);
+        }
+        setOpen++;
+        wildcard = true;
+        break;
+      case '^': // ^ inside [...] can be unescaped
+        if (setOpen == 0) {
+          regex.append(BACKSLASH);
+        }
+        break;
+      case '!': // [! needs to be translated to [^
+        regex.append(setOpen > 0 && '[' == glob.charAt(i - 1) ? '^' : '!');
+        continue;
+      case ']':
+        // Set errors like [][] cannot be told apart here from the valid
+        // []], []-] and [-]; the regex compiler does the real checking.
+        setOpen = 0;
+        break;
+      default:
+      }
+      regex.append(c);
+    }
+
+    if (setOpen > 0) {
+      error("Unclosed character class", glob, len);
+    }
+    if (curlyOpen > 0) {
+      error("Unclosed group", glob, len);
+    }
+    // DOTALL so that * and ? also match line terminators
+    compiled = Pattern.compile(regex.toString(), Pattern.DOTALL);
+    hasWildcard = wildcard;
+  }
+
+  /** @return true if this is a wildcard pattern (with special chars) */
+  public boolean hasWildcard() {
+    return hasWildcard;
+  }
+
+  private static void error(String message, String pattern, int pos) {
+    throw new PatternSyntaxException(message, pattern, pos);
+  }
+}
diff --git a/src/test/core/org/apache/hadoop/fs/TestGlobPattern.java b/src/test/core/org/apache/hadoop/fs/TestGlobPattern.java
new file mode 100644
index 00000000000..0fffc47b33b
--- /dev/null
+++ b/src/test/core/org/apache/hadoop/fs/TestGlobPattern.java
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs;
+
+import java.util.regex.PatternSyntaxException;
+import org.junit.Test;
+import static org.junit.Assert.*;
+
+/**
+ * Tests for glob patterns
+ */
+public class TestGlobPattern {
+  /** Assert that the glob matches (or, if !yes, rejects) every input. */
+  private void assertMatch(boolean yes, String glob, String...input) {
+    GlobPattern pattern = new GlobPattern(glob);
+
+    for (String s : input) {
+      assertEquals(glob +" should"+ (yes ? "" : " not") +" match "+ s,
+                   yes, pattern.matches(s));
+    }
+  }
+
+  /** Assert that compiling each glob throws PatternSyntaxException. */
+  private void shouldThrow(String... globs) {
+    for (String glob : globs) {
+      try {
+        GlobPattern.compile(glob);
+        fail("glob "+ glob +" should throw");
+      } catch (PatternSyntaxException expected) {
+        // expected: nothing to report for a correctly rejected glob
+      }
+    }
+  }
+
+  @Test public void testValidPatterns() {
+    assertMatch(true, "*", "^$", "foo", "bar");
+    assertMatch(true, "?", "?", "^", "[", "]", "$");
+    assertMatch(true, "foo*", "foo", "food", "fool");
+    assertMatch(true, "f*d", "fud", "food");
+    assertMatch(true, "*d", "good", "bad");
+    assertMatch(true, "\\*\\?\\[\\{\\\\", "*?[{\\");
+    assertMatch(true, "[]^-]", "]", "-", "^");
+    assertMatch(true, "]", "]");
+    assertMatch(true, "^.$()|+", "^.$()|+");
+    assertMatch(true, "[^^]", ".", "$", "[", "]");
+    assertMatch(false, "[^^]", "^");
+    assertMatch(true, "[!!-]", "^", "?");
+    assertMatch(false, "[!!-]", "!", "-");
+    assertMatch(true, "{[12]*,[45]*,[78]*}", "1", "2!", "4", "42", "7", "7$");
+    assertMatch(false, "{[12]*,[45]*,[78]*}", "3", "6", "9ß");
+    assertMatch(true, "}", "}");
+  }
+
+  @Test public void testInvalidPatterns() {
+    // unclosed set, nested set, malformed set, unclosed group, lone escape
+    shouldThrow("[", "[[]]", "[][]", "{", "\\");
+  }
+}