diff --git a/CHANGES.txt b/CHANGES.txt
index 88aeb2f883f..35e42c73fa4 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -58,6 +58,10 @@ New features
7. LUCENE-573: QueryParser now allows backslash escaping in
quoted terms and phrases. (Michael Busch via Yonik Seeley)
+ 8. LUCENE-716: QueryParser now allows specification of Unicode
+ characters in terms via a Unicode escape of the form \uXXXX.
+ (Michael Busch via Yonik Seeley)
+
API Changes
1. LUCENE-438: Remove "final" from Token, implement Cloneable, allow
diff --git a/src/java/org/apache/lucene/queryParser/QueryParser.java b/src/java/org/apache/lucene/queryParser/QueryParser.java
index 9ef3044c9d3..804ae94099a 100644
--- a/src/java/org/apache/lucene/queryParser/QueryParser.java
+++ b/src/java/org/apache/lucene/queryParser/QueryParser.java
@@ -621,6 +621,9 @@ public class QueryParser implements QueryParserConstants {
* Returns a String where the escape char has been
* removed, or kept only once if there was a double escape.
*
+ * Supports escaped unicode characters, e. g. translates
+ * <code>\\u0041</code> to <code>A</code>.
+ *
*/
private String discardEscapeChar(String input) throws ParseException {
// Create char array to hold unescaped char sequence
@@ -635,12 +638,31 @@ public class QueryParser implements QueryParserConstants {
// an escape character
boolean lastCharWasEscapeChar = false;
+ // The multiplier the current unicode digit must be multiplied with.
+ // E. g. the first digit must be multiplied with 16^3, the second with 16^2...
+ int codePointMultiplier = 0;
+
+ // Used to calculate the codepoint of the escaped unicode character
+ int codePoint = 0;
+
for (int i = 0; i < input.length(); i++) {
char curChar = input.charAt(i);
- if (lastCharWasEscapeChar) {
- // this character was escaped
- output[length] = curChar;
- length++;
+ if (codePointMultiplier > 0) {
+ codePoint += hexToInt(curChar) * codePointMultiplier;
+ codePointMultiplier >>>= 4;
+ if (codePointMultiplier == 0) {
+ length += Character.toChars(codePoint, output, length);
+ codePoint = 0;
+ }
+ } else if (lastCharWasEscapeChar) {
+ if (curChar == 'u') {
+ // found an escaped unicode character
+ codePointMultiplier = 16 * 16 * 16;
+ } else {
+ // this character was escaped
+ output[length] = curChar;
+ length++;
+ }
lastCharWasEscapeChar = false;
} else {
if (curChar == '\\') {
@@ -652,12 +674,30 @@ public class QueryParser implements QueryParserConstants {
}
}
+ if (codePointMultiplier > 0) {
+ throw new ParseException("Truncated unicode escape sequence.");
+ }
+
if (lastCharWasEscapeChar) {
throw new ParseException("Term can not end with escape character.");
}
return new String(output, 0, length);
}
+
+ /** Returns the value (0-15) of the hexadecimal digit c (0-9, a-f or A-F); throws ParseException for any other character. */
+ private static final int hexToInt(char c) throws ParseException {
+ if ('0' <= c && c <= '9') {
+ return c - '0';
+ } else if ('a' <= c && c <= 'f') {
+ return c - 'a' + 10;
+ } else if ('A' <= c && c <= 'F') {
+ return c - 'A' + 10;
+ } else {
+ throw new ParseException("Non-hex character in unicode escape sequence: " + c);
+ }
+ }
+
/**
* Returns a String where those characters that QueryParser
* expects to be escaped are escaped by a preceding <code>\</code>.
diff --git a/src/java/org/apache/lucene/queryParser/QueryParser.jj b/src/java/org/apache/lucene/queryParser/QueryParser.jj
index 850bc669b48..eaa1a6c1200 100644
--- a/src/java/org/apache/lucene/queryParser/QueryParser.jj
+++ b/src/java/org/apache/lucene/queryParser/QueryParser.jj
@@ -644,6 +644,10 @@ public class QueryParser {
/**
* Returns a String where the escape char has been
* removed, or kept only once if there was a double escape.
+ *
+ * Supports escaped unicode characters, e. g. translates
+ * <code>\\u0041</code> to <code>A</code>.
+ *
*/
private String discardEscapeChar(String input) throws ParseException {
// Create char array to hold unescaped char sequence
@@ -658,12 +662,31 @@ public class QueryParser {
// an escape character
boolean lastCharWasEscapeChar = false;
+ // The multiplier the current unicode digit must be multiplied with.
+ // E. g. the first digit must be multiplied with 16^3, the second with 16^2...
+ int codePointMultiplier = 0;
+
+ // Used to calculate the codepoint of the escaped unicode character
+ int codePoint = 0;
+
for (int i = 0; i < input.length(); i++) {
char curChar = input.charAt(i);
- if (lastCharWasEscapeChar) {
- // this character was escaped
- output[length] = curChar;
- length++;
+ if (codePointMultiplier > 0) {
+ codePoint += hexToInt(curChar) * codePointMultiplier;
+ codePointMultiplier >>>= 4;
+ if (codePointMultiplier == 0) {
+ length += Character.toChars(codePoint, output, length);
+ codePoint = 0;
+ }
+ } else if (lastCharWasEscapeChar) {
+ if (curChar == 'u') {
+ // found an escaped unicode character
+ codePointMultiplier = 16 * 16 * 16;
+ } else {
+ // this character was escaped
+ output[length] = curChar;
+ length++;
+ }
lastCharWasEscapeChar = false;
} else {
if (curChar == '\\') {
@@ -675,12 +698,30 @@ public class QueryParser {
}
}
+ if (codePointMultiplier > 0) {
+ throw new ParseException("Truncated unicode escape sequence.");
+ }
+
if (lastCharWasEscapeChar) {
throw new ParseException("Term can not end with escape character.");
}
return new String(output, 0, length);
}
+
+ /** Returns the value (0-15) of the hexadecimal digit c (0-9, a-f or A-F); throws ParseException for any other character. */
+ private static final int hexToInt(char c) throws ParseException {
+ if ('0' <= c && c <= '9') {
+ return c - '0';
+ } else if ('a' <= c && c <= 'f') {
+ return c - 'a' + 10;
+ } else if ('A' <= c && c <= 'F') {
+ return c - 'A' + 10;
+ } else {
+ throw new ParseException("Non-hex character in unicode escape sequence: " + c);
+ }
+ }
+
/**
* Returns a String where those characters that QueryParser
* expects to be escaped are escaped by a preceding <code>\</code>.
diff --git a/src/test/org/apache/lucene/queryParser/TestQueryParser.java b/src/test/org/apache/lucene/queryParser/TestQueryParser.java
index 0447523e373..c932a18105f 100644
--- a/src/test/org/apache/lucene/queryParser/TestQueryParser.java
+++ b/src/test/org/apache/lucene/queryParser/TestQueryParser.java
@@ -486,6 +486,28 @@ public class TestQueryParser extends TestCase {
assertQueryEquals("\"a \\+b c d\"", a, "\"a +b c d\"");
assertQueryEquals("c\\:\\\\temp\\\\\\~foo.txt", a, "c:\\temp\\~foo.txt");
+
+
+ try {
+ assertQueryEquals("XY\\", a, "XYZ");
+ fail("ParseException expected, not thrown");
+ } catch (ParseException expected) {}
+
+ // test unicode escaping
+ assertQueryEquals("a\\u0062c", a, "abc");
+ assertQueryEquals("XY\\u005a", a, "XYZ");
+ assertQueryEquals("XY\\u005A", a, "XYZ");
+ assertQueryEquals("\"a \\\\\\u0028\\u0062\\\" c\"", a, "\"a \\(b\" c\"");
+
+ try {
+ assertQueryEquals("XY\\u005G", a, "XYZ");
+ fail("ParseException expected, not thrown");
+ } catch (ParseException expected) {}
+
+ try {
+ assertQueryEquals("XY\\u005", a, "XYZ");
+ fail("ParseException expected, not thrown");
+ } catch (ParseException expected) {}
}
public void testQueryStringEscaping() throws Exception {