unicode escapes for QueryParser: LUCENE-716

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@476679 13f79535-47bb-0310-9956-ffa450edef68
2006-11-19 01:34:10 +00:00 · 2006-11-19 01:34:10 +00:00 · f5661f7c58
parent fd42f16265
commit f5661f7c58
4 changed files with 115 additions and 8 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -58,6 +58,10 @@ New features
 7. LUCENE-573: QueryParser now allows backslash escaping in
    quoted terms and phrases. (Michael Busch via Yonik Seeley)
 7. LUCENE-716: QueryParser now allows specification of unicode
    characters in terms via a unicode escape of the form \uXXXX
    (Michael Busch via Yonik Seeley)
 API Changes
 1. LUCENE-438: Remove "final" from Token, implement Cloneable, allow
--- a/src/java/org/apache/lucene/queryParser/QueryParser.java
+++ b/src/java/org/apache/lucene/queryParser/QueryParser.java
@ -621,6 +621,9 @@ public class QueryParser implements QueryParserConstants {
   * Returns a String where the escape char has been
   * removed, or kept only once if there was a double escape.
   * 
   * Supports escaped unicode characters, e. g. translates
   * <code>A</code> to <code>A</code>.
   * 
   */
  private String discardEscapeChar(String input) throws ParseException {
    // Create char array to hold unescaped char sequence
@ -635,12 +638,31 @@ public class QueryParser implements QueryParserConstants {
    // an escape character
    boolean lastCharWasEscapeChar = false;
    // The multiplier the current unicode digit must be multiplied with.
    // E. g. the first digit must be multiplied with 16^3, the second with 16^2...
    int codePointMultiplier = 0;
    // Used to calculate the codepoint of the escaped unicode character
    int codePoint = 0;
    for (int i = 0; i < input.length(); i++) {
      char curChar = input.charAt(i);
-      if (lastCharWasEscapeChar) {
+      if (codePointMultiplier > 0) {
-        // this character was escaped
+        codePoint += hexToInt(curChar) * codePointMultiplier;
-        output[length] = curChar;
+        codePointMultiplier >>>= 4;
-        length++;
+        if (codePointMultiplier == 0) {
          length += Character.toChars(codePoint, output, length);
          codePoint = 0;
        }
      } else if (lastCharWasEscapeChar) {
        if (curChar == 'u') {
          // found an escaped unicode character
          codePointMultiplier = 16 * 16 * 16;
        } else {
          // this character was escaped
          output[length] = curChar;
          length++;
        }
        lastCharWasEscapeChar = false;
      } else {
        if (curChar == '\\') {
@ -652,12 +674,30 @@ public class QueryParser implements QueryParserConstants {
      }
    }
    if (codePointMultiplier > 0) {
      throw new ParseException("Truncated unicode escape sequence.");
    }
    if (lastCharWasEscapeChar) {
      throw new ParseException("Term can not end with escape character.");
    }
    return new String(output, 0, length);
  }
  /** Returns the numeric value of the hexadecimal character */
  private static final int hexToInt(char c) throws ParseException {
    if ('0' <= c && c <= '9') {
      return c - '0';
    } else if ('a' <= c && c <= 'f'){
      return c - 'a' + 10;
    } else if ('A' <= c && c <= 'F') {
      return c - 'A' + 10;
    } else {
      throw new ParseException("None-hex character in unicode escape sequence: " + c);
    }
  }
  /**
   * Returns a String where those characters that QueryParser
   * expects to be escaped are escaped by a preceding <code>\</code>.
--- a/src/java/org/apache/lucene/queryParser/QueryParser.jj
+++ b/src/java/org/apache/lucene/queryParser/QueryParser.jj
@ -644,6 +644,10 @@ public class QueryParser {
  /**
   * Returns a String where the escape char has been
   * removed, or kept only once if there was a double escape.
   * 
   * Supports escaped unicode characters, e. g. translates
   * <code>\u0041</code> to <code>A</code>.
   * 
   */
  private String discardEscapeChar(String input) throws ParseException {
    // Create char array to hold unescaped char sequence
@ -658,12 +662,31 @@ public class QueryParser {
    // an escape character
    boolean lastCharWasEscapeChar = false;
    // The multiplier the current unicode digit must be multiplied with.
    // E. g. the first digit must be multiplied with 16^3, the second with 16^2...
    int codePointMultiplier = 0;
    // Used to calculate the codepoint of the escaped unicode character
    int codePoint = 0;
    for (int i = 0; i < input.length(); i++) {
      char curChar = input.charAt(i);
-      if (lastCharWasEscapeChar) {
+      if (codePointMultiplier > 0) {
-        // this character was escaped
+        codePoint += hexToInt(curChar) * codePointMultiplier;
-        output[length] = curChar;    
+        codePointMultiplier >>>= 4;
-        length++;
+        if (codePointMultiplier == 0) {
          length += Character.toChars(codePoint, output, length);
          codePoint = 0;
        }
      } else if (lastCharWasEscapeChar) {
        if (curChar == 'u') {
          // found an escaped unicode character
          codePointMultiplier = 16 * 16 * 16;
        } else { 
          // this character was escaped
          output[length] = curChar;    
          length++;
        }
        lastCharWasEscapeChar = false;
      } else {
        if (curChar == '\\') {
@ -675,12 +698,30 @@ public class QueryParser {
      }
    }
    if (codePointMultiplier > 0) {
      throw new ParseException("Truncated unicode escape sequence.");
    }
    if (lastCharWasEscapeChar) {
      throw new ParseException("Term can not end with escape character.");
    }
    return new String(output, 0, length);
  }
  /** Returns the numeric value of the hexadecimal character */
  private static final int hexToInt(char c) throws ParseException {
    if ('0' <= c && c <= '9') {
      return c - '0';
    } else if ('a' <= c && c <= 'f'){
      return c - 'a' + 10;
    } else if ('A' <= c && c <= 'F') {
      return c - 'A' + 10;
    } else {
      throw new ParseException("None-hex character in unicode escape sequence: " + c);
    }
  }
  /**
   * Returns a String where those characters that QueryParser
   * expects to be escaped are escaped by a preceding <code>\</code>.
--- a/src/test/org/apache/lucene/queryParser/TestQueryParser.java
+++ b/src/test/org/apache/lucene/queryParser/TestQueryParser.java
@ -486,6 +486,28 @@ public class TestQueryParser extends TestCase {
    assertQueryEquals("\"a \\+b c d\"", a, "\"a +b c d\"");
    assertQueryEquals("c\\:\\\\temp\\\\\\~foo.txt", a, "c:\\temp\\~foo.txt");
    try {
        assertQueryEquals("XY\\", a, "XYZ");
        fail("ParseException expected, not thrown");
    } catch (ParseException expected) {}
    // test unicode escaping
    assertQueryEquals("a\\u0062c", a, "abc");
    assertQueryEquals("XY\\u005a", a, "XYZ");
    assertQueryEquals("XY\\u005A", a, "XYZ");
    assertQueryEquals("\"a \\\\\\u0028\\u0062\\\" c\"", a, "\"a \\(b\" c\"");
    try {
        assertQueryEquals("XY\\u005G", a, "XYZ");
        fail("ParseException expected, not thrown");
    } catch (ParseException expected) {}
    try {
        assertQueryEquals("XY\\u005", a, "XYZ");
        fail("ParseException expected, not thrown");
    } catch (ParseException expected) {}
  }
  public void testQueryStringEscaping() throws Exception {