From 584d2b394b02f65934554fe2cb9dfec556f32ffa Mon Sep 17 00:00:00 2001
From: Brian Goetz <briangoetz@apache.org>
Date: Thu, 1 Nov 2001 01:12:37 +0000
Subject: [PATCH] Fix query parser so it accepts queries with unicode
 characters

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@149616 13f79535-47bb-0310-9956-ffa450edef68
---
 .../lucene/queryParser/FastCharStream.java    | 159 ++++++++++++++++++
 .../apache/lucene/queryParser/QueryParser.jj  |  65 ++++---
 .../lucene/queryParser/TestQueryParser.java   |   3 +
 3 files changed, 202 insertions(+), 25 deletions(-)
 create mode 100644 src/java/org/apache/lucene/queryParser/FastCharStream.java
diff --git a/src/java/org/apache/lucene/queryParser/FastCharStream.java b/src/java/org/apache/lucene/queryParser/FastCharStream.java
new file mode 100644
index 00000000000..0be6c173af3
--- /dev/null
+++ b/src/java/org/apache/lucene/queryParser/FastCharStream.java
@@ -0,0 +1,159 @@
+// FastCharStream.java
+package org.apache.lucene.queryParser;
+
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation.  All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ *    if any, must include the following acknowledgment:
+ *       "This product includes software developed by the
+ *        Apache Software Foundation (http://www.apache.org/)."
+ *    Alternately, this acknowledgment may appear in the software itself,
+ *    if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ *    "Apache Lucene" must not be used to endorse or promote products
+ *    derived from this software without prior written permission. For
+ *    written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
+ *    prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation.  For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+import java.io.*;
+
+/** An efficient implementation of JavaCC's CharStream interface.  <p>Note that
+ * this does not do line-number counting, but instead keeps track of the
+ * character position of the token in the input, as required by Lucene's {@link
+ * org.apache.lucene.analysis.Token} API. */
+public final class FastCharStream implements CharStream {
+  char[] buffer = null;
+
+  int bufferLength = 0;				  // end of valid chars
+  int bufferPosition = 0;			  // next char to read
+  
+  int tokenStart = 0;				  // offset in buffer
+  int bufferStart = 0;				  // position in file of buffer
+
+  Reader input;					  // source of chars
+
+  /** Constructs from a Reader. */
+  public FastCharStream(Reader r) {
+    input = r;
+  }
+
+  public final char readChar() throws IOException {
+    if (bufferPosition >= bufferLength)
+      refill();
+    return buffer[bufferPosition++];
+  }
+
+  private final void refill() throws IOException {
+    int newPosition = bufferLength - tokenStart;
+
+    if (tokenStart == 0) {			  // token won't fit in buffer
+      if (buffer == null) {			  // first time: alloc buffer
+	buffer = new char[2048];		  
+      } else if (bufferLength == buffer.length) { // grow buffer
+	char[] newBuffer = new char[buffer.length*2];
+	System.arraycopy(buffer, 0, newBuffer, 0, bufferLength);
+	buffer = newBuffer;
+      }
+    } else {					  // shift token to front
+      System.arraycopy(buffer, tokenStart, buffer, 0, newPosition);
+    }
+
+    bufferLength = newPosition;			  // update state
+    bufferPosition = newPosition;
+    bufferStart += tokenStart;
+    tokenStart = 0;
+
+    int charsRead =				  // fill space in buffer
+      input.read(buffer, newPosition, buffer.length-newPosition);
+    if (charsRead == -1)
+      throw new IOException("read past eof");
+    else
+      bufferLength += charsRead;
+  }
+
+  public final char BeginToken() throws IOException {
+    tokenStart = bufferPosition;
+    return readChar();
+  }
+
+  public final void backup(int amount) {
+    bufferPosition -= amount;
+  }
+
+  public final String GetImage() {
+    return new String(buffer, tokenStart, bufferPosition - tokenStart);
+  }
+
+  public final char[] GetSuffix(int len) {
+    char[] value = new char[len];
+    System.arraycopy(buffer, bufferPosition - len, value, 0, len);
+    return value;
+  }
+
+  public final void Done() {
+    try {
+      input.close();
+    } catch (IOException e) {
+      System.err.println("Caught: " + e + "; ignoring.");
+    }
+  }
+
+  public final int getColumn() {
+    return bufferStart + bufferPosition;
+  }
+  public final int getLine() {
+    return 1;
+  }
+  public final int getEndColumn() {
+    return bufferStart + bufferPosition;
+  }
+  public final int getEndLine() {
+    return 1;
+  }
+  public final int getBeginColumn() {
+    return bufferStart + tokenStart;
+  }
+  public final int getBeginLine() {
+    return 1;
+  }
+}
diff --git a/src/java/org/apache/lucene/queryParser/QueryParser.jj b/src/java/org/apache/lucene/queryParser/QueryParser.jj
index ec87328b1c2..23168e57f67 100644
--- a/src/java/org/apache/lucene/queryParser/QueryParser.jj
+++ b/src/java/org/apache/lucene/queryParser/QueryParser.jj
@@ -54,7 +54,9 @@
 
 
 options {
-  STATIC= false;
+  STATIC=false;
+  JAVA_UNICODE_ESCAPE=true;
+  USER_CHAR_STREAM=true;
 }
 
 PARSER_BEGIN(QueryParser)
@@ -94,6 +96,8 @@ import org.apache.lucene.search.*;
  *   Query  ::= ( Clause )*
  *   Clause ::= ["+", "-"] [<TERM> ":"] ( <TERM> | "(" Query ")" )
  * </pre>
+ *
+ * @author Brian Goetz
  */
 
 public class QueryParser {
@@ -118,7 +122,7 @@ public class QueryParser {
    *  @param analyzer   used to find terms in the query text.
    */
   public QueryParser(String f, Analyzer a) {
-    this(new StringReader(""));
+    this(new FastCharStream(new StringReader("")));
     analyzer = a;
     field = f;
   }
@@ -128,7 +132,7 @@ public class QueryParser {
    *  @param query	the query string to be parsed.
    */
   public Query parse(String query) throws ParseException {
-    ReInit(new StringReader(query));
+    ReInit(new FastCharStream(new StringReader(query)));
     return Query(field);
   }
 
@@ -168,7 +172,8 @@ public class QueryParser {
     // Use the analyzer to get all the tokens, and then build a TermQuery,
     // PhraseQuery, or nothing based on the term count
     
-    TokenStream source = analyzer.tokenStream(field, new StringReader(queryText));
+    TokenStream source = analyzer.tokenStream(field, 
+                                              new StringReader(queryText));
     Vector v = new Vector();
     org.apache.lucene.analysis.Token t;
 
@@ -252,8 +257,8 @@ PARSER_END(QueryParser)
   <#_ALPHA_CHAR: ["a"-"z", "A"-"Z"] >
 | <#_NUM_CHAR:   ["0"-"9"] >
 | <#_ALPHANUM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9" ] >
-| <#_IDENTIFIER_CHAR: [ "a"-"z", "A"-"Z", "0"-"9", "_" ] >
-| <#_IDENTIFIER: <_ALPHA_CHAR> (<_IDENTIFIER_CHAR>)* >
+| <#_TERM_START_CHAR: [ "a"-"z", "A"-"Z", "_", "\u0080"-"\uFFFE" ] >
+| <#_TERM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9", "_", "\u0080"-"\uFFFE" ] >
 | <#_NEWLINE:    ( "\r\n" | "\r" | "\n" ) >
 | <#_WHITESPACE: ( " " | "\t" ) >
 | <#_QCHAR:      ( "\\" (<_NEWLINE> | ~["a"-"z", "A"-"Z", "0"-"9"] ) ) >
@@ -272,12 +277,11 @@ PARSER_END(QueryParser)
 | <CARAT:     "^" >
 | <STAR:      "*" >
 | <QUOTED:     "\"" (~["\""])+ "\"">
-| <NUMBER:    (<_NUM_CHAR>)+ "." (<_NUM_CHAR>)+ >
-| <TERM:      <_IDENTIFIER_CHAR> 
-              ( ~["\"", " ", "\t", "(", ")", ":", "&", "|", "^", "*", "?", "~", "{", "}", "[", "]" ] )* >
+| <NUMBER:    (<_NUM_CHAR>)+ ( "." (<_NUM_CHAR>)+ )? >
+| <TERM:      <_TERM_START_CHAR> (<_TERM_CHAR>)*  >
 | <FUZZY:     "~" >
-| <WILDTERM:  <_IDENTIFIER_CHAR>
-              ( ~["\"", " ", "\t", "(", ")", ":", "&", "|", "^", "~", "{", "}", "[", "]" ] )* <_IDENTIFIER_CHAR>>
+| <WILDTERM:  <_TERM_START_CHAR> 
+              (<_TERM_CHAR> | ( [ "*", "?" ] ))* >
 | <RANGEIN:   "[" (~["]"])+ "]">
 | <RANGEEX:   "{" (~["}"])+ "}">
 }
@@ -363,23 +367,34 @@ Query Term(String field) : {
 }
 {
   ( 
-     (term=<TERM>|term=<WILDTERM>{wildcard=true;}|term=<NUMBER>)[<STAR>{prefix=true;}|<FUZZY>{fuzzy=true;}][<CARAT> boost=<NUMBER>]
-      { if (wildcard)
-          q = new WildcardQuery(new Term(field, term.image));
-        else if (prefix) 
-          q = new PrefixQuery(new Term(field, term.image));
-        else if (fuzzy)
-          q = new FuzzyQuery(new Term(field, term.image));
-        else
-          q = getFieldQuery(field, analyzer, term.image); }
-    | (term=<RANGEIN>{rangein=true;}|term=<RANGEEX>)
+     (
+       term=<TERM>
+       | term=<WILDTERM> { wildcard=true; }
+       | term=<NUMBER>
+     )
+     [ <STAR> { prefix=true; } | <FUZZY> { fuzzy=true; } ]
+     [ <CARAT> boost=<NUMBER> ]
+     { 
+       if (wildcard)
+         q = new WildcardQuery(new Term(field, term.image));
+       else if (prefix) 
+         q = new PrefixQuery(new Term(field, term.image));
+       else if (fuzzy)
+         q = new FuzzyQuery(new Term(field, term.image));
+       else
+         q = getFieldQuery(field, analyzer, term.image); 
+     }
+     | ( term=<RANGEIN> { rangein=true; } | term=<RANGEEX> )
         {
           q = getRangeQuery(field, analyzer, 
-                            term.image.substring(1, term.image.length()-1), rangein);
+                            term.image.substring(1, term.image.length()-1), 
+                            rangein);
         }
-    | term=<QUOTED> 
-      { q = getFieldQuery(field, analyzer, 
-                          term.image.substring(1, term.image.length()-1)); }
+     | term=<QUOTED> 
+       { 
+         q = getFieldQuery(field, analyzer, 
+                           term.image.substring(1, term.image.length()-1)); 
+       }
   )
   { 
     if (boost != null) {
diff --git a/src/test/org/apache/lucene/queryParser/TestQueryParser.java b/src/test/org/apache/lucene/queryParser/TestQueryParser.java
index fb66ff539fe..645815b174a 100644
--- a/src/test/org/apache/lucene/queryParser/TestQueryParser.java
+++ b/src/test/org/apache/lucene/queryParser/TestQueryParser.java
@@ -138,6 +138,8 @@ public class TestQueryParser extends TestCase {
 
   public void testSimple() throws Exception {
     assertQueryEquals("term term term", null, "term term term");
+    assertQueryEquals("türm term term", null, "türm term term");
+    assertQueryEquals("ümlaut", null, "ümlaut");
     assertQueryEquals("term term1 term2", null, "term term term");
     assertQueryEquals("term 1.0 1 2", null, "term");
 
@@ -163,6 +165,7 @@ public class TestQueryParser extends TestCase {
 
     assertQueryEquals("germ term^2.0", null, "germ term^2.0");
     assertQueryEquals("term^2.0", null, "term^2.0");
+    assertQueryEquals("term^2", null, "term^2.0");
 
     assertQueryEquals("(foo OR bar) AND (baz OR boo)", null, 
                       "+(foo bar) +(baz boo)");