From 584d2b394b02f65934554fe2cb9dfec556f32ffa Mon Sep 17 00:00:00 2001 From: Brian Goetz Date: Thu, 1 Nov 2001 01:12:37 +0000 Subject: [PATCH] Fix query parser so it accepts queries with unicode characters git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@149616 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/queryParser/FastCharStream.java | 159 ++++++++++++++++++ .../apache/lucene/queryParser/QueryParser.jj | 65 ++++--- .../lucene/queryParser/TestQueryParser.java | 3 + 3 files changed, 202 insertions(+), 25 deletions(-) create mode 100644 src/java/org/apache/lucene/queryParser/FastCharStream.java diff --git a/src/java/org/apache/lucene/queryParser/FastCharStream.java b/src/java/org/apache/lucene/queryParser/FastCharStream.java new file mode 100644 index 00000000000..0be6c173af3 --- /dev/null +++ b/src/java/org/apache/lucene/queryParser/FastCharStream.java @@ -0,0 +1,159 @@ +// FastCharStream.java +package org.apache.lucene.queryParser; + +/* ==================================================================== + * The Apache Software License, Version 1.1 + * + * Copyright (c) 2001 The Apache Software Foundation. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. + * + * 4. The names "Apache" and "Apache Software Foundation" and + * "Apache Lucene" must not be used to endorse or promote products + * derived from this software without prior written permission. For + * written permission, please contact apache@apache.org. + * + * 5. Products derived from this software may not be called "Apache", + * "Apache Lucene", nor may "Apache" appear in their name, without + * prior written permission of the Apache Software Foundation. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . + */ + +import java.io.*; + +/** An efficient implementation of JavaCC's CharStream interface.

Note that + * this does not do line-number counting, but instead keeps track of the + * character position of the token in the input, as required by Lucene's {@link + * org.apache.lucene.analysis.Token} API. */ +public final class FastCharStream implements CharStream { + char[] buffer = null; + + int bufferLength = 0; // end of valid chars + int bufferPosition = 0; // next char to read + + int tokenStart = 0; // offset in buffer + int bufferStart = 0; // position in file of buffer + + Reader input; // source of chars + + /** Constructs from a Reader. */ + public FastCharStream(Reader r) { + input = r; + } + + public final char readChar() throws IOException { + if (bufferPosition >= bufferLength) + refill(); + return buffer[bufferPosition++]; + } + + private final void refill() throws IOException { + int newPosition = bufferLength - tokenStart; + + if (tokenStart == 0) { // token won't fit in buffer + if (buffer == null) { // first time: alloc buffer + buffer = new char[2048]; + } else if (bufferLength == buffer.length) { // grow buffer + char[] newBuffer = new char[buffer.length*2]; + System.arraycopy(buffer, 0, newBuffer, 0, bufferLength); + buffer = newBuffer; + } + } else { // shift token to front + System.arraycopy(buffer, tokenStart, buffer, 0, newPosition); + } + + bufferLength = newPosition; // update state + bufferPosition = newPosition; + bufferStart += tokenStart; + tokenStart = 0; + + int charsRead = // fill space in buffer + input.read(buffer, newPosition, buffer.length-newPosition); + if (charsRead == -1) + throw new IOException("read past eof"); + else + bufferLength += charsRead; + } + + public final char BeginToken() throws IOException { + tokenStart = bufferPosition; + return readChar(); + } + + public final void backup(int amount) { + bufferPosition -= amount; + } + + public final String GetImage() { + return new String(buffer, tokenStart, bufferPosition - tokenStart); + } + + public final char[] GetSuffix(int len) { + char[] value = new char[len]; + System.arraycopy(buffer, bufferPosition - len, value, 0, len); + return value; + } + + public final void Done() { + try { + input.close(); + } catch (IOException e) { + System.err.println("Caught: " + e + "; ignoring."); + } + } + + public final int getColumn() { + return bufferStart + bufferPosition; + } + public final int getLine() { + return 1; + } + public final int getEndColumn() { + return bufferStart + bufferPosition; + } + public final int getEndLine() { + return 1; + } + public final int getBeginColumn() { + return bufferStart + tokenStart; + } + public final int getBeginLine() { + return 1; + } +} diff --git a/src/java/org/apache/lucene/queryParser/QueryParser.jj b/src/java/org/apache/lucene/queryParser/QueryParser.jj index ec87328b1c2..23168e57f67 100644 --- a/src/java/org/apache/lucene/queryParser/QueryParser.jj +++ b/src/java/org/apache/lucene/queryParser/QueryParser.jj @@ -54,7 +54,9 @@ options { - STATIC= false; + STATIC=false; + JAVA_UNICODE_ESCAPE=true; + USER_CHAR_STREAM=true; } PARSER_BEGIN(QueryParser) @@ -94,6 +96,8 @@ import org.apache.lucene.search.*; * Query ::= ( Clause )* * Clause ::= ["+", "-"] [ ":"] ( | "(" Query ")" ) * + * + * @author Brian Goetz */ public class QueryParser { @@ -118,7 +122,7 @@ public class QueryParser { * @param analyzer used to find terms in the query text. */ public QueryParser(String f, Analyzer a) { - this(new StringReader("")); + this(new FastCharStream(new StringReader(""))); analyzer = a; field = f; } @@ -128,7 +132,7 @@ public class QueryParser { * @param query the query string to be parsed. */ public Query parse(String query) throws ParseException { - ReInit(new StringReader(query)); + ReInit(new FastCharStream(new StringReader(query))); return Query(field); } @@ -168,7 +172,8 @@ public class QueryParser { // Use the analyzer to get all the tokens, and then build a TermQuery, // PhraseQuery, or nothing based on the term count - TokenStream source = analyzer.tokenStream(field, new StringReader(queryText)); + TokenStream source = analyzer.tokenStream(field, + new StringReader(queryText)); Vector v = new Vector(); org.apache.lucene.analysis.Token t; @@ -252,8 +257,8 @@ PARSER_END(QueryParser) <#_ALPHA_CHAR: ["a"-"z", "A"-"Z"] > | <#_NUM_CHAR: ["0"-"9"] > | <#_ALPHANUM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9" ] > -| <#_IDENTIFIER_CHAR: [ "a"-"z", "A"-"Z", "0"-"9", "_" ] > -| <#_IDENTIFIER: <_ALPHA_CHAR> (<_IDENTIFIER_CHAR>)* > +| <#_TERM_START_CHAR: [ "a"-"z", "A"-"Z", "_", "\u0080"-"\uFFFE" ] > +| <#_TERM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9", "_", "\u0080"-"\uFFFE" ] > | <#_NEWLINE: ( "\r\n" | "\r" | "\n" ) > | <#_WHITESPACE: ( " " | "\t" ) > | <#_QCHAR: ( "\\" (<_NEWLINE> | ~["a"-"z", "A"-"Z", "0"-"9"] ) ) > @@ -272,12 +277,11 @@ PARSER_END(QueryParser) | | | -| )+ "." (<_NUM_CHAR>)+ > -| - ( ~["\"", " ", "\t", "(", ")", ":", "&", "|", "^", "*", "?", "~", "{", "}", "[", "]" ] )* > +| )+ ( "." (<_NUM_CHAR>)+ )? > +| (<_TERM_CHAR>)* > | -| - ( ~["\"", " ", "\t", "(", ")", ":", "&", "|", "^", "~", "{", "}", "[", "]" ] )* <_IDENTIFIER_CHAR>> +| + (<_TERM_CHAR> | ( [ "*", "?" ] ))* > | | } @@ -363,23 +367,34 @@ Query Term(String field) : { } { ( - (term=|term={wildcard=true;}|term=)[{prefix=true;}|{fuzzy=true;}][ boost=] - { if (wildcard) - q = new WildcardQuery(new Term(field, term.image)); - else if (prefix) - q = new PrefixQuery(new Term(field, term.image)); - else if (fuzzy) - q = new FuzzyQuery(new Term(field, term.image)); - else - q = getFieldQuery(field, analyzer, term.image); } - | (term={rangein=true;}|term=) + ( + term= + | term= { wildcard=true; } + | term= + ) + [ { prefix=true; } | { fuzzy=true; } ] + [ boost= ] + { + if (wildcard) + q = new WildcardQuery(new Term(field, term.image)); + else if (prefix) + q = new PrefixQuery(new Term(field, term.image)); + else if (fuzzy) + q = new FuzzyQuery(new Term(field, term.image)); + else + q = getFieldQuery(field, analyzer, term.image); + } + | ( term= { rangein=true; } | term= ) { q = getRangeQuery(field, analyzer, - term.image.substring(1, term.image.length()-1), rangein); + term.image.substring(1, term.image.length()-1), + rangein); } - | term= - { q = getFieldQuery(field, analyzer, - term.image.substring(1, term.image.length()-1)); } + | term= + { + q = getFieldQuery(field, analyzer, + term.image.substring(1, term.image.length()-1)); + } ) { if (boost != null) { diff --git a/src/test/org/apache/lucene/queryParser/TestQueryParser.java b/src/test/org/apache/lucene/queryParser/TestQueryParser.java index fb66ff539fe..645815b174a 100644 --- a/src/test/org/apache/lucene/queryParser/TestQueryParser.java +++ b/src/test/org/apache/lucene/queryParser/TestQueryParser.java @@ -138,6 +138,8 @@ public class TestQueryParser extends TestCase { public void testSimple() throws Exception { assertQueryEquals("term term term", null, "term term term"); + assertQueryEquals("türm term term", null, "türm term term"); + assertQueryEquals("ümlaut", null, "ümlaut"); assertQueryEquals("term term1 term2", null, "term term term"); assertQueryEquals("term 1.0 1 2", null, "term"); @@ -163,6 +165,7 @@ public class TestQueryParser extends TestCase { assertQueryEquals("germ term^2.0", null, "germ term^2.0"); assertQueryEquals("term^2.0", null, "term^2.0"); + assertQueryEquals("term^2", null, "term^2.0"); assertQueryEquals("(foo OR bar) AND (baz OR boo)", null, "+(foo bar) +(baz boo)");