mirror of https://github.com/apache/lucene.git
Fix query parser so it accepts queries with unicode characters
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@149616 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
992d1ade00
commit
584d2b394b
|
@ -0,0 +1,159 @@
|
||||||
|
// FastCharStream.java
|
||||||
|
package org.apache.lucene.queryParser;
|
||||||
|
|
||||||
|
/* ====================================================================
|
||||||
|
* The Apache Software License, Version 1.1
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* 3. The end-user documentation included with the redistribution,
|
||||||
|
* if any, must include the following acknowledgment:
|
||||||
|
* "This product includes software developed by the
|
||||||
|
* Apache Software Foundation (http://www.apache.org/)."
|
||||||
|
* Alternately, this acknowledgment may appear in the software itself,
|
||||||
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
|
*
|
||||||
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
|
* derived from this software without prior written permission. For
|
||||||
|
* written permission, please contact apache@apache.org.
|
||||||
|
*
|
||||||
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
|
* prior written permission of the Apache Software Foundation.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||||
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
|
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
|
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
* ====================================================================
|
||||||
|
*
|
||||||
|
* This software consists of voluntary contributions made by many
|
||||||
|
* individuals on behalf of the Apache Software Foundation. For more
|
||||||
|
* information on the Apache Software Foundation, please see
|
||||||
|
* <http://www.apache.org/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
|
||||||
|
/** An efficient implementation of JavaCC's CharStream interface. <p>Note that
|
||||||
|
* this does not do line-number counting, but instead keeps track of the
|
||||||
|
* character position of the token in the input, as required by Lucene's {@link
|
||||||
|
* org.apache.lucene.analysis.Token} API. */
|
||||||
|
public final class FastCharStream implements CharStream {
|
||||||
|
char[] buffer = null;
|
||||||
|
|
||||||
|
int bufferLength = 0; // end of valid chars
|
||||||
|
int bufferPosition = 0; // next char to read
|
||||||
|
|
||||||
|
int tokenStart = 0; // offset in buffer
|
||||||
|
int bufferStart = 0; // position in file of buffer
|
||||||
|
|
||||||
|
Reader input; // source of chars
|
||||||
|
|
||||||
|
/** Constructs from a Reader. */
|
||||||
|
public FastCharStream(Reader r) {
|
||||||
|
input = r;
|
||||||
|
}
|
||||||
|
|
||||||
|
public final char readChar() throws IOException {
|
||||||
|
if (bufferPosition >= bufferLength)
|
||||||
|
refill();
|
||||||
|
return buffer[bufferPosition++];
|
||||||
|
}
|
||||||
|
|
||||||
|
private final void refill() throws IOException {
|
||||||
|
int newPosition = bufferLength - tokenStart;
|
||||||
|
|
||||||
|
if (tokenStart == 0) { // token won't fit in buffer
|
||||||
|
if (buffer == null) { // first time: alloc buffer
|
||||||
|
buffer = new char[2048];
|
||||||
|
} else if (bufferLength == buffer.length) { // grow buffer
|
||||||
|
char[] newBuffer = new char[buffer.length*2];
|
||||||
|
System.arraycopy(buffer, 0, newBuffer, 0, bufferLength);
|
||||||
|
buffer = newBuffer;
|
||||||
|
}
|
||||||
|
} else { // shift token to front
|
||||||
|
System.arraycopy(buffer, tokenStart, buffer, 0, newPosition);
|
||||||
|
}
|
||||||
|
|
||||||
|
bufferLength = newPosition; // update state
|
||||||
|
bufferPosition = newPosition;
|
||||||
|
bufferStart += tokenStart;
|
||||||
|
tokenStart = 0;
|
||||||
|
|
||||||
|
int charsRead = // fill space in buffer
|
||||||
|
input.read(buffer, newPosition, buffer.length-newPosition);
|
||||||
|
if (charsRead == -1)
|
||||||
|
throw new IOException("read past eof");
|
||||||
|
else
|
||||||
|
bufferLength += charsRead;
|
||||||
|
}
|
||||||
|
|
||||||
|
public final char BeginToken() throws IOException {
|
||||||
|
tokenStart = bufferPosition;
|
||||||
|
return readChar();
|
||||||
|
}
|
||||||
|
|
||||||
|
public final void backup(int amount) {
|
||||||
|
bufferPosition -= amount;
|
||||||
|
}
|
||||||
|
|
||||||
|
public final String GetImage() {
|
||||||
|
return new String(buffer, tokenStart, bufferPosition - tokenStart);
|
||||||
|
}
|
||||||
|
|
||||||
|
public final char[] GetSuffix(int len) {
|
||||||
|
char[] value = new char[len];
|
||||||
|
System.arraycopy(buffer, bufferPosition - len, value, 0, len);
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
public final void Done() {
|
||||||
|
try {
|
||||||
|
input.close();
|
||||||
|
} catch (IOException e) {
|
||||||
|
System.err.println("Caught: " + e + "; ignoring.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public final int getColumn() {
|
||||||
|
return bufferStart + bufferPosition;
|
||||||
|
}
|
||||||
|
public final int getLine() {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
public final int getEndColumn() {
|
||||||
|
return bufferStart + bufferPosition;
|
||||||
|
}
|
||||||
|
public final int getEndLine() {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
public final int getBeginColumn() {
|
||||||
|
return bufferStart + tokenStart;
|
||||||
|
}
|
||||||
|
public final int getBeginLine() {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
|
@ -55,6 +55,8 @@
|
||||||
|
|
||||||
options {
|
options {
|
||||||
STATIC=false;
|
STATIC=false;
|
||||||
|
JAVA_UNICODE_ESCAPE=true;
|
||||||
|
USER_CHAR_STREAM=true;
|
||||||
}
|
}
|
||||||
|
|
||||||
PARSER_BEGIN(QueryParser)
|
PARSER_BEGIN(QueryParser)
|
||||||
|
@ -94,6 +96,8 @@ import org.apache.lucene.search.*;
|
||||||
* Query ::= ( Clause )*
|
* Query ::= ( Clause )*
|
||||||
* Clause ::= ["+", "-"] [<TERM> ":"] ( <TERM> | "(" Query ")" )
|
* Clause ::= ["+", "-"] [<TERM> ":"] ( <TERM> | "(" Query ")" )
|
||||||
* </pre>
|
* </pre>
|
||||||
|
*
|
||||||
|
* @author Brian Goetz
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class QueryParser {
|
public class QueryParser {
|
||||||
|
@ -118,7 +122,7 @@ public class QueryParser {
|
||||||
* @param analyzer used to find terms in the query text.
|
* @param analyzer used to find terms in the query text.
|
||||||
*/
|
*/
|
||||||
public QueryParser(String f, Analyzer a) {
|
public QueryParser(String f, Analyzer a) {
|
||||||
this(new StringReader(""));
|
this(new FastCharStream(new StringReader("")));
|
||||||
analyzer = a;
|
analyzer = a;
|
||||||
field = f;
|
field = f;
|
||||||
}
|
}
|
||||||
|
@ -128,7 +132,7 @@ public class QueryParser {
|
||||||
* @param query the query string to be parsed.
|
* @param query the query string to be parsed.
|
||||||
*/
|
*/
|
||||||
public Query parse(String query) throws ParseException {
|
public Query parse(String query) throws ParseException {
|
||||||
ReInit(new StringReader(query));
|
ReInit(new FastCharStream(new StringReader(query)));
|
||||||
return Query(field);
|
return Query(field);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -168,7 +172,8 @@ public class QueryParser {
|
||||||
// Use the analyzer to get all the tokens, and then build a TermQuery,
|
// Use the analyzer to get all the tokens, and then build a TermQuery,
|
||||||
// PhraseQuery, or nothing based on the term count
|
// PhraseQuery, or nothing based on the term count
|
||||||
|
|
||||||
TokenStream source = analyzer.tokenStream(field, new StringReader(queryText));
|
TokenStream source = analyzer.tokenStream(field,
|
||||||
|
new StringReader(queryText));
|
||||||
Vector v = new Vector();
|
Vector v = new Vector();
|
||||||
org.apache.lucene.analysis.Token t;
|
org.apache.lucene.analysis.Token t;
|
||||||
|
|
||||||
|
@ -252,8 +257,8 @@ PARSER_END(QueryParser)
|
||||||
<#_ALPHA_CHAR: ["a"-"z", "A"-"Z"] >
|
<#_ALPHA_CHAR: ["a"-"z", "A"-"Z"] >
|
||||||
| <#_NUM_CHAR: ["0"-"9"] >
|
| <#_NUM_CHAR: ["0"-"9"] >
|
||||||
| <#_ALPHANUM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9" ] >
|
| <#_ALPHANUM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9" ] >
|
||||||
| <#_IDENTIFIER_CHAR: [ "a"-"z", "A"-"Z", "0"-"9", "_" ] >
|
| <#_TERM_START_CHAR: [ "a"-"z", "A"-"Z", "_", "\u0080"-"\uFFFE" ] >
|
||||||
| <#_IDENTIFIER: <_ALPHA_CHAR> (<_IDENTIFIER_CHAR>)* >
|
| <#_TERM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9", "_", "\u0080"-"\uFFFE" ] >
|
||||||
| <#_NEWLINE: ( "\r\n" | "\r" | "\n" ) >
|
| <#_NEWLINE: ( "\r\n" | "\r" | "\n" ) >
|
||||||
| <#_WHITESPACE: ( " " | "\t" ) >
|
| <#_WHITESPACE: ( " " | "\t" ) >
|
||||||
| <#_QCHAR: ( "\\" (<_NEWLINE> | ~["a"-"z", "A"-"Z", "0"-"9"] ) ) >
|
| <#_QCHAR: ( "\\" (<_NEWLINE> | ~["a"-"z", "A"-"Z", "0"-"9"] ) ) >
|
||||||
|
@ -272,12 +277,11 @@ PARSER_END(QueryParser)
|
||||||
| <CARAT: "^" >
|
| <CARAT: "^" >
|
||||||
| <STAR: "*" >
|
| <STAR: "*" >
|
||||||
| <QUOTED: "\"" (~["\""])+ "\"">
|
| <QUOTED: "\"" (~["\""])+ "\"">
|
||||||
| <NUMBER: (<_NUM_CHAR>)+ "." (<_NUM_CHAR>)+ >
|
| <NUMBER: (<_NUM_CHAR>)+ ( "." (<_NUM_CHAR>)+ )? >
|
||||||
| <TERM: <_IDENTIFIER_CHAR>
|
| <TERM: <_TERM_START_CHAR> (<_TERM_CHAR>)* >
|
||||||
( ~["\"", " ", "\t", "(", ")", ":", "&", "|", "^", "*", "?", "~", "{", "}", "[", "]" ] )* >
|
|
||||||
| <FUZZY: "~" >
|
| <FUZZY: "~" >
|
||||||
| <WILDTERM: <_IDENTIFIER_CHAR>
|
| <WILDTERM: <_TERM_START_CHAR>
|
||||||
( ~["\"", " ", "\t", "(", ")", ":", "&", "|", "^", "~", "{", "}", "[", "]" ] )* <_IDENTIFIER_CHAR>>
|
(<_TERM_CHAR> | ( [ "*", "?" ] ))* >
|
||||||
| <RANGEIN: "[" (~["]"])+ "]">
|
| <RANGEIN: "[" (~["]"])+ "]">
|
||||||
| <RANGEEX: "{" (~["}"])+ "}">
|
| <RANGEEX: "{" (~["}"])+ "}">
|
||||||
}
|
}
|
||||||
|
@ -363,23 +367,34 @@ Query Term(String field) : {
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
(
|
(
|
||||||
(term=<TERM>|term=<WILDTERM>{wildcard=true;}|term=<NUMBER>)[<STAR>{prefix=true;}|<FUZZY>{fuzzy=true;}][<CARAT> boost=<NUMBER>]
|
(
|
||||||
{ if (wildcard)
|
term=<TERM>
|
||||||
|
| term=<WILDTERM> { wildcard=true; }
|
||||||
|
| term=<NUMBER>
|
||||||
|
)
|
||||||
|
[ <STAR> { prefix=true; } | <FUZZY> { fuzzy=true; } ]
|
||||||
|
[ <CARAT> boost=<NUMBER> ]
|
||||||
|
{
|
||||||
|
if (wildcard)
|
||||||
q = new WildcardQuery(new Term(field, term.image));
|
q = new WildcardQuery(new Term(field, term.image));
|
||||||
else if (prefix)
|
else if (prefix)
|
||||||
q = new PrefixQuery(new Term(field, term.image));
|
q = new PrefixQuery(new Term(field, term.image));
|
||||||
else if (fuzzy)
|
else if (fuzzy)
|
||||||
q = new FuzzyQuery(new Term(field, term.image));
|
q = new FuzzyQuery(new Term(field, term.image));
|
||||||
else
|
else
|
||||||
q = getFieldQuery(field, analyzer, term.image); }
|
q = getFieldQuery(field, analyzer, term.image);
|
||||||
|
}
|
||||||
| ( term=<RANGEIN> { rangein=true; } | term=<RANGEEX> )
|
| ( term=<RANGEIN> { rangein=true; } | term=<RANGEEX> )
|
||||||
{
|
{
|
||||||
q = getRangeQuery(field, analyzer,
|
q = getRangeQuery(field, analyzer,
|
||||||
term.image.substring(1, term.image.length()-1), rangein);
|
term.image.substring(1, term.image.length()-1),
|
||||||
|
rangein);
|
||||||
}
|
}
|
||||||
| term=<QUOTED>
|
| term=<QUOTED>
|
||||||
{ q = getFieldQuery(field, analyzer,
|
{
|
||||||
term.image.substring(1, term.image.length()-1)); }
|
q = getFieldQuery(field, analyzer,
|
||||||
|
term.image.substring(1, term.image.length()-1));
|
||||||
|
}
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
if (boost != null) {
|
if (boost != null) {
|
||||||
|
|
|
@ -138,6 +138,8 @@ public class TestQueryParser extends TestCase {
|
||||||
|
|
||||||
public void testSimple() throws Exception {
|
public void testSimple() throws Exception {
|
||||||
assertQueryEquals("term term term", null, "term term term");
|
assertQueryEquals("term term term", null, "term term term");
|
||||||
|
assertQueryEquals("türm term term", null, "türm term term");
|
||||||
|
assertQueryEquals("ümlaut", null, "ümlaut");
|
||||||
assertQueryEquals("term term1 term2", null, "term term term");
|
assertQueryEquals("term term1 term2", null, "term term term");
|
||||||
assertQueryEquals("term 1.0 1 2", null, "term");
|
assertQueryEquals("term 1.0 1 2", null, "term");
|
||||||
|
|
||||||
|
@ -163,6 +165,7 @@ public class TestQueryParser extends TestCase {
|
||||||
|
|
||||||
assertQueryEquals("germ term^2.0", null, "germ term^2.0");
|
assertQueryEquals("germ term^2.0", null, "germ term^2.0");
|
||||||
assertQueryEquals("term^2.0", null, "term^2.0");
|
assertQueryEquals("term^2.0", null, "term^2.0");
|
||||||
|
assertQueryEquals("term^2", null, "term^2.0");
|
||||||
|
|
||||||
assertQueryEquals("(foo OR bar) AND (baz OR boo)", null,
|
assertQueryEquals("(foo OR bar) AND (baz OR boo)", null,
|
||||||
"+(foo bar) +(baz boo)");
|
"+(foo bar) +(baz boo)");
|
||||||
|
|
Loading…
Reference in New Issue