mirror of https://github.com/apache/lucene.git
unicode escapes for QueryParser: LUCENE-716
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@476679 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
fd42f16265
commit
f5661f7c58
|
@ -58,6 +58,10 @@ New features
|
||||||
7. LUCENE-573: QueryParser now allows backslash escaping in
|
7. LUCENE-573: QueryParser now allows backslash escaping in
|
||||||
quoted terms and phrases. (Michael Busch via Yonik Seeley)
|
quoted terms and phrases. (Michael Busch via Yonik Seeley)
|
||||||
|
|
||||||
|
8. LUCENE-716: QueryParser now allows specification of unicode
|
||||||
|
characters in terms via a unicode escape of the form \uXXXX
|
||||||
|
(Michael Busch via Yonik Seeley)
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
|
|
||||||
1. LUCENE-438: Remove "final" from Token, implement Cloneable, allow
|
1. LUCENE-438: Remove "final" from Token, implement Cloneable, allow
|
||||||
|
|
|
@ -621,6 +621,9 @@ public class QueryParser implements QueryParserConstants {
|
||||||
* Returns a String where the escape char has been
|
* Returns a String where the escape char has been
|
||||||
* removed, or kept only once if there was a double escape.
|
* removed, or kept only once if there was a double escape.
|
||||||
*
|
*
|
||||||
|
* Supports escaped unicode characters, e. g. translates
|
||||||
|
* <code>\u0041</code> to <code>A</code>.
|
||||||
|
*
|
||||||
*/
|
*/
|
||||||
private String discardEscapeChar(String input) throws ParseException {
|
private String discardEscapeChar(String input) throws ParseException {
|
||||||
// Create char array to hold unescaped char sequence
|
// Create char array to hold unescaped char sequence
|
||||||
|
@ -635,12 +638,31 @@ public class QueryParser implements QueryParserConstants {
|
||||||
// an escape character
|
// an escape character
|
||||||
boolean lastCharWasEscapeChar = false;
|
boolean lastCharWasEscapeChar = false;
|
||||||
|
|
||||||
|
// The multiplier the current unicode digit must be multiplied with.
|
||||||
|
// E. g. the first digit must be multiplied with 16^3, the second with 16^2...
|
||||||
|
int codePointMultiplier = 0;
|
||||||
|
|
||||||
|
// Used to calculate the codepoint of the escaped unicode character
|
||||||
|
int codePoint = 0;
|
||||||
|
|
||||||
for (int i = 0; i < input.length(); i++) {
|
for (int i = 0; i < input.length(); i++) {
|
||||||
char curChar = input.charAt(i);
|
char curChar = input.charAt(i);
|
||||||
if (lastCharWasEscapeChar) {
|
if (codePointMultiplier > 0) {
|
||||||
|
codePoint += hexToInt(curChar) * codePointMultiplier;
|
||||||
|
codePointMultiplier >>>= 4;
|
||||||
|
if (codePointMultiplier == 0) {
|
||||||
|
length += Character.toChars(codePoint, output, length);
|
||||||
|
codePoint = 0;
|
||||||
|
}
|
||||||
|
} else if (lastCharWasEscapeChar) {
|
||||||
|
if (curChar == 'u') {
|
||||||
|
// found an escaped unicode character
|
||||||
|
codePointMultiplier = 16 * 16 * 16;
|
||||||
|
} else {
|
||||||
// this character was escaped
|
// this character was escaped
|
||||||
output[length] = curChar;
|
output[length] = curChar;
|
||||||
length++;
|
length++;
|
||||||
|
}
|
||||||
lastCharWasEscapeChar = false;
|
lastCharWasEscapeChar = false;
|
||||||
} else {
|
} else {
|
||||||
if (curChar == '\\') {
|
if (curChar == '\\') {
|
||||||
|
@ -652,12 +674,30 @@ public class QueryParser implements QueryParserConstants {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (codePointMultiplier > 0) {
|
||||||
|
throw new ParseException("Truncated unicode escape sequence.");
|
||||||
|
}
|
||||||
|
|
||||||
if (lastCharWasEscapeChar) {
|
if (lastCharWasEscapeChar) {
|
||||||
throw new ParseException("Term can not end with escape character.");
|
throw new ParseException("Term can not end with escape character.");
|
||||||
}
|
}
|
||||||
|
|
||||||
return new String(output, 0, length);
|
return new String(output, 0, length);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Returns the numeric value of the hexadecimal character */
|
||||||
|
private static final int hexToInt(char c) throws ParseException {
|
||||||
|
if ('0' <= c && c <= '9') {
|
||||||
|
return c - '0';
|
||||||
|
} else if ('a' <= c && c <= 'f'){
|
||||||
|
return c - 'a' + 10;
|
||||||
|
} else if ('A' <= c && c <= 'F') {
|
||||||
|
return c - 'A' + 10;
|
||||||
|
} else {
|
||||||
|
throw new ParseException("None-hex character in unicode escape sequence: " + c);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a String where those characters that QueryParser
|
* Returns a String where those characters that QueryParser
|
||||||
* expects to be escaped are escaped by a preceding <code>\</code>.
|
* expects to be escaped are escaped by a preceding <code>\</code>.
|
||||||
|
|
|
@ -644,6 +644,10 @@ public class QueryParser {
|
||||||
/**
|
/**
|
||||||
* Returns a String where the escape char has been
|
* Returns a String where the escape char has been
|
||||||
* removed, or kept only once if there was a double escape.
|
* removed, or kept only once if there was a double escape.
|
||||||
|
*
|
||||||
|
* Supports escaped unicode characters, e. g. translates
|
||||||
|
* <code>\u0041</code> to <code>A</code>.
|
||||||
|
*
|
||||||
*/
|
*/
|
||||||
private String discardEscapeChar(String input) throws ParseException {
|
private String discardEscapeChar(String input) throws ParseException {
|
||||||
// Create char array to hold unescaped char sequence
|
// Create char array to hold unescaped char sequence
|
||||||
|
@ -658,12 +662,31 @@ public class QueryParser {
|
||||||
// an escape character
|
// an escape character
|
||||||
boolean lastCharWasEscapeChar = false;
|
boolean lastCharWasEscapeChar = false;
|
||||||
|
|
||||||
|
// The multiplier the current unicode digit must be multiplied with.
|
||||||
|
// E. g. the first digit must be multiplied with 16^3, the second with 16^2...
|
||||||
|
int codePointMultiplier = 0;
|
||||||
|
|
||||||
|
// Used to calculate the codepoint of the escaped unicode character
|
||||||
|
int codePoint = 0;
|
||||||
|
|
||||||
for (int i = 0; i < input.length(); i++) {
|
for (int i = 0; i < input.length(); i++) {
|
||||||
char curChar = input.charAt(i);
|
char curChar = input.charAt(i);
|
||||||
if (lastCharWasEscapeChar) {
|
if (codePointMultiplier > 0) {
|
||||||
|
codePoint += hexToInt(curChar) * codePointMultiplier;
|
||||||
|
codePointMultiplier >>>= 4;
|
||||||
|
if (codePointMultiplier == 0) {
|
||||||
|
length += Character.toChars(codePoint, output, length);
|
||||||
|
codePoint = 0;
|
||||||
|
}
|
||||||
|
} else if (lastCharWasEscapeChar) {
|
||||||
|
if (curChar == 'u') {
|
||||||
|
// found an escaped unicode character
|
||||||
|
codePointMultiplier = 16 * 16 * 16;
|
||||||
|
} else {
|
||||||
// this character was escaped
|
// this character was escaped
|
||||||
output[length] = curChar;
|
output[length] = curChar;
|
||||||
length++;
|
length++;
|
||||||
|
}
|
||||||
lastCharWasEscapeChar = false;
|
lastCharWasEscapeChar = false;
|
||||||
} else {
|
} else {
|
||||||
if (curChar == '\\') {
|
if (curChar == '\\') {
|
||||||
|
@ -675,12 +698,30 @@ public class QueryParser {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (codePointMultiplier > 0) {
|
||||||
|
throw new ParseException("Truncated unicode escape sequence.");
|
||||||
|
}
|
||||||
|
|
||||||
if (lastCharWasEscapeChar) {
|
if (lastCharWasEscapeChar) {
|
||||||
throw new ParseException("Term can not end with escape character.");
|
throw new ParseException("Term can not end with escape character.");
|
||||||
}
|
}
|
||||||
|
|
||||||
return new String(output, 0, length);
|
return new String(output, 0, length);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Returns the numeric value of the hexadecimal character */
|
||||||
|
private static final int hexToInt(char c) throws ParseException {
|
||||||
|
if ('0' <= c && c <= '9') {
|
||||||
|
return c - '0';
|
||||||
|
} else if ('a' <= c && c <= 'f'){
|
||||||
|
return c - 'a' + 10;
|
||||||
|
} else if ('A' <= c && c <= 'F') {
|
||||||
|
return c - 'A' + 10;
|
||||||
|
} else {
|
||||||
|
throw new ParseException("None-hex character in unicode escape sequence: " + c);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a String where those characters that QueryParser
|
* Returns a String where those characters that QueryParser
|
||||||
* expects to be escaped are escaped by a preceding <code>\</code>.
|
* expects to be escaped are escaped by a preceding <code>\</code>.
|
||||||
|
|
|
@ -486,6 +486,28 @@ public class TestQueryParser extends TestCase {
|
||||||
assertQueryEquals("\"a \\+b c d\"", a, "\"a +b c d\"");
|
assertQueryEquals("\"a \\+b c d\"", a, "\"a +b c d\"");
|
||||||
|
|
||||||
assertQueryEquals("c\\:\\\\temp\\\\\\~foo.txt", a, "c:\\temp\\~foo.txt");
|
assertQueryEquals("c\\:\\\\temp\\\\\\~foo.txt", a, "c:\\temp\\~foo.txt");
|
||||||
|
|
||||||
|
|
||||||
|
try {
|
||||||
|
assertQueryEquals("XY\\", a, "XYZ");
|
||||||
|
fail("ParseException expected, not thrown");
|
||||||
|
} catch (ParseException expected) {}
|
||||||
|
|
||||||
|
// test unicode escaping
|
||||||
|
assertQueryEquals("a\\u0062c", a, "abc");
|
||||||
|
assertQueryEquals("XY\\u005a", a, "XYZ");
|
||||||
|
assertQueryEquals("XY\\u005A", a, "XYZ");
|
||||||
|
assertQueryEquals("\"a \\\\\\u0028\\u0062\\\" c\"", a, "\"a \\(b\" c\"");
|
||||||
|
|
||||||
|
try {
|
||||||
|
assertQueryEquals("XY\\u005G", a, "XYZ");
|
||||||
|
fail("ParseException expected, not thrown");
|
||||||
|
} catch (ParseException expected) {}
|
||||||
|
|
||||||
|
try {
|
||||||
|
assertQueryEquals("XY\\u005", a, "XYZ");
|
||||||
|
fail("ParseException expected, not thrown");
|
||||||
|
} catch (ParseException expected) {}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testQueryStringEscaping() throws Exception {
|
public void testQueryStringEscaping() throws Exception {
|
||||||
|
|
Loading…
Reference in New Issue