diff --git a/CHANGES.txt b/CHANGES.txt index 88aeb2f883f..35e42c73fa4 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -58,6 +58,10 @@ New features 7. LUCENE-573: QueryParser now allows backslash escaping in quoted terms and phrases. (Michael Busch via Yonik Seeley) + 7. LUCENE-716: QueryParser now allows specification of unicode + characters in terms via a unicode escape of the form \uXXXX + (Michael Busch via Yonik Seeley) + API Changes 1. LUCENE-438: Remove "final" from Token, implement Cloneable, allow diff --git a/src/java/org/apache/lucene/queryParser/QueryParser.java b/src/java/org/apache/lucene/queryParser/QueryParser.java index 9ef3044c9d3..804ae94099a 100644 --- a/src/java/org/apache/lucene/queryParser/QueryParser.java +++ b/src/java/org/apache/lucene/queryParser/QueryParser.java @@ -621,6 +621,9 @@ public class QueryParser implements QueryParserConstants { * Returns a String where the escape char has been * removed, or kept only once if there was a double escape. * + * Supports escaped unicode characters, e. g. translates + * A to A. + * */ private String discardEscapeChar(String input) throws ParseException { // Create char array to hold unescaped char sequence @@ -635,12 +638,31 @@ public class QueryParser implements QueryParserConstants { // an escape character boolean lastCharWasEscapeChar = false; + // The multiplier the current unicode digit must be multiplied with. + // E. g. the first digit must be multiplied with 16^3, the second with 16^2... + int codePointMultiplier = 0; + + // Used to calculate the codepoint of the escaped unicode character + int codePoint = 0; + for (int i = 0; i < input.length(); i++) { char curChar = input.charAt(i); - if (lastCharWasEscapeChar) { - // this character was escaped - output[length] = curChar; - length++; + if (codePointMultiplier > 0) { + codePoint += hexToInt(curChar) * codePointMultiplier; + codePointMultiplier >>>= 4; + if (codePointMultiplier == 0) { + length += Character.toChars(codePoint, output, length); + codePoint = 0; + } + } else if (lastCharWasEscapeChar) { + if (curChar == 'u') { + // found an escaped unicode character + codePointMultiplier = 16 * 16 * 16; + } else { + // this character was escaped + output[length] = curChar; + length++; + } lastCharWasEscapeChar = false; } else { if (curChar == '\\') { @@ -652,12 +674,30 @@ public class QueryParser implements QueryParserConstants { } } + if (codePointMultiplier > 0) { + throw new ParseException("Truncated unicode escape sequence."); + } + if (lastCharWasEscapeChar) { throw new ParseException("Term can not end with escape character."); } return new String(output, 0, length); } + + /** Returns the numeric value of the hexadecimal character */ + private static final int hexToInt(char c) throws ParseException { + if ('0' <= c && c <= '9') { + return c - '0'; + } else if ('a' <= c && c <= 'f'){ + return c - 'a' + 10; + } else if ('A' <= c && c <= 'F') { + return c - 'A' + 10; + } else { + throw new ParseException("None-hex character in unicode escape sequence: " + c); + } + } + /** * Returns a String where those characters that QueryParser * expects to be escaped are escaped by a preceding \. diff --git a/src/java/org/apache/lucene/queryParser/QueryParser.jj b/src/java/org/apache/lucene/queryParser/QueryParser.jj index 850bc669b48..eaa1a6c1200 100644 --- a/src/java/org/apache/lucene/queryParser/QueryParser.jj +++ b/src/java/org/apache/lucene/queryParser/QueryParser.jj @@ -644,6 +644,10 @@ public class QueryParser { /** * Returns a String where the escape char has been * removed, or kept only once if there was a double escape. + * + * Supports escaped unicode characters, e. g. translates + * \u0041 to A. + * */ private String discardEscapeChar(String input) throws ParseException { // Create char array to hold unescaped char sequence @@ -658,12 +662,31 @@ public class QueryParser { // an escape character boolean lastCharWasEscapeChar = false; + // The multiplier the current unicode digit must be multiplied with. + // E. g. the first digit must be multiplied with 16^3, the second with 16^2... + int codePointMultiplier = 0; + + // Used to calculate the codepoint of the escaped unicode character + int codePoint = 0; + for (int i = 0; i < input.length(); i++) { char curChar = input.charAt(i); - if (lastCharWasEscapeChar) { - // this character was escaped - output[length] = curChar; - length++; + if (codePointMultiplier > 0) { + codePoint += hexToInt(curChar) * codePointMultiplier; + codePointMultiplier >>>= 4; + if (codePointMultiplier == 0) { + length += Character.toChars(codePoint, output, length); + codePoint = 0; + } + } else if (lastCharWasEscapeChar) { + if (curChar == 'u') { + // found an escaped unicode character + codePointMultiplier = 16 * 16 * 16; + } else { + // this character was escaped + output[length] = curChar; + length++; + } lastCharWasEscapeChar = false; } else { if (curChar == '\\') { @@ -675,12 +698,30 @@ public class QueryParser { } } + if (codePointMultiplier > 0) { + throw new ParseException("Truncated unicode escape sequence."); + } + if (lastCharWasEscapeChar) { throw new ParseException("Term can not end with escape character."); } return new String(output, 0, length); } + + /** Returns the numeric value of the hexadecimal character */ + private static final int hexToInt(char c) throws ParseException { + if ('0' <= c && c <= '9') { + return c - '0'; + } else if ('a' <= c && c <= 'f'){ + return c - 'a' + 10; + } else if ('A' <= c && c <= 'F') { + return c - 'A' + 10; + } else { + throw new ParseException("None-hex character in unicode escape sequence: " + c); + } + } + /** * Returns a String where those characters that QueryParser * expects to be escaped are escaped by a preceding \. diff --git a/src/test/org/apache/lucene/queryParser/TestQueryParser.java b/src/test/org/apache/lucene/queryParser/TestQueryParser.java index 0447523e373..c932a18105f 100644 --- a/src/test/org/apache/lucene/queryParser/TestQueryParser.java +++ b/src/test/org/apache/lucene/queryParser/TestQueryParser.java @@ -486,6 +486,28 @@ public class TestQueryParser extends TestCase { assertQueryEquals("\"a \\+b c d\"", a, "\"a +b c d\""); assertQueryEquals("c\\:\\\\temp\\\\\\~foo.txt", a, "c:\\temp\\~foo.txt"); + + + try { + assertQueryEquals("XY\\", a, "XYZ"); + fail("ParseException expected, not thrown"); + } catch (ParseException expected) {} + + // test unicode escaping + assertQueryEquals("a\\u0062c", a, "abc"); + assertQueryEquals("XY\\u005a", a, "XYZ"); + assertQueryEquals("XY\\u005A", a, "XYZ"); + assertQueryEquals("\"a \\\\\\u0028\\u0062\\\" c\"", a, "\"a \\(b\" c\""); + + try { + assertQueryEquals("XY\\u005G", a, "XYZ"); + fail("ParseException expected, not thrown"); + } catch (ParseException expected) {} + + try { + assertQueryEquals("XY\\u005", a, "XYZ"); + fail("ParseException expected, not thrown"); + } catch (ParseException expected) {} } public void testQueryStringEscaping() throws Exception {