mirror of https://github.com/apache/lucene.git
Fix query parser (finally) to be much more lenient about queries that have funny characters; added new test cases to test new rules; added NullTokenizer/NullAnalyzer which just pass through space-separated tokens unmodified (mostly for testing purposes
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@149641 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ae45d392f8
commit
1fa4fa82d1
|
@ -0,0 +1,65 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
/** An Analyzer that filters LetterTokenizer with LowerCaseFilter. */
|
||||
|
||||
public final class NullAnalyzer extends Analyzer {
|
||||
public final TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return new NullTokenizer(reader);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,117 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
/** LowerCaseTokenizer performs the function of LetterTokenizer
|
||||
and LowerCaseFilter together. It divides text at non-letters and converts
|
||||
them to lower case. While it is functionally equivalent to the combination
|
||||
of LetterTokenizer and LowerCaseFilter, there is a performance advantage
|
||||
to doing the two tasks at once, hence this (redundent) implementation.
|
||||
|
||||
Note: this does a decent job for most European languages, but does a terrible
|
||||
job for some Asian languages, where words are not separated by spaces. */
|
||||
|
||||
public final class NullTokenizer extends Tokenizer {
|
||||
public NullTokenizer(Reader in) {
|
||||
input = in;
|
||||
}
|
||||
|
||||
private int offset = 0, bufferIndex=0, dataLen=0;
|
||||
private final static int MAX_WORD_LEN = 255;
|
||||
private final static int IO_BUFFER_SIZE = 1024;
|
||||
private final char[] buffer = new char[MAX_WORD_LEN];
|
||||
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
|
||||
|
||||
public final Token next() throws java.io.IOException {
|
||||
int length = 0;
|
||||
int start = offset;
|
||||
while (true) {
|
||||
final char c;
|
||||
|
||||
offset++;
|
||||
if (bufferIndex >= dataLen) {
|
||||
dataLen = input.read(ioBuffer);
|
||||
bufferIndex = 0;
|
||||
};
|
||||
if (dataLen == -1) {
|
||||
if (length > 0)
|
||||
break;
|
||||
else
|
||||
return null;
|
||||
}
|
||||
else
|
||||
c = (char) ioBuffer[bufferIndex++];
|
||||
|
||||
if (Character.isWhitespace(c)) {
|
||||
if (length > 0)
|
||||
break;
|
||||
else
|
||||
continue;
|
||||
}
|
||||
|
||||
if (length == 0) // start of token
|
||||
start = offset-1;
|
||||
|
||||
buffer[length++] = c;
|
||||
// buffer it
|
||||
if (length == MAX_WORD_LEN) // buffer overflow!
|
||||
break;
|
||||
}
|
||||
|
||||
return new Token(new String(buffer, 0, length), start, start+length);
|
||||
}
|
||||
}
|
|
@ -261,11 +261,16 @@ PARSER_END(QueryParser)
|
|||
|
||||
<*> TOKEN : {
|
||||
<#_NUM_CHAR: ["0"-"9"] >
|
||||
| <#_TERM_START_CHAR: [ "a"-"z", "A"-"Z", "_", "\u0080"-"\uFFFE" ] >
|
||||
| <#_TERM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9", "_", ".", "\u0080"-"\uFFFE" ] >
|
||||
| <#_TERM_START_CHAR: ~[ " ", "\t", "+", "-", "!", "(", ")", ":", "^",
|
||||
"[", "]", "\"", "{", "}", "~", "*" ] >
|
||||
| <#_TERM_CHAR: <_TERM_START_CHAR> >
|
||||
| <#_WHITESPACE: ( " " | "\t" ) >
|
||||
}
|
||||
|
||||
<DEFAULT> SKIP : {
|
||||
<<_WHITESPACE>>
|
||||
}
|
||||
|
||||
<DEFAULT> TOKEN : {
|
||||
<AND: ("AND" | "&&") >
|
||||
| <OR: ("OR" | "||") >
|
||||
|
@ -275,9 +280,8 @@ PARSER_END(QueryParser)
|
|||
| <LPAREN: "(" >
|
||||
| <RPAREN: ")" >
|
||||
| <COLON: ":" >
|
||||
| <CARAT: "^" >
|
||||
| <CARAT: "^" > : Boost
|
||||
| <QUOTED: "\"" (~["\""])+ "\"">
|
||||
| <NUMBER: (<_NUM_CHAR>)+ ( "." (<_NUM_CHAR>)+ )? >
|
||||
| <TERM: <_TERM_START_CHAR> (<_TERM_CHAR>)* >
|
||||
| <FUZZY: "~" >
|
||||
| <PREFIXTERM: <_TERM_START_CHAR> (<_TERM_CHAR>)* "*" >
|
||||
|
@ -287,8 +291,8 @@ PARSER_END(QueryParser)
|
|||
| <RANGEEX: "{" ( ~[ "}" ] )+ "}">
|
||||
}
|
||||
|
||||
<DEFAULT> SKIP : {
|
||||
<<_WHITESPACE>>
|
||||
<Boost> TOKEN : {
|
||||
<NUMBER: (<_NUM_CHAR>)+ ( "." (<_NUM_CHAR>)+ )? > : DEFAULT
|
||||
}
|
||||
|
||||
// * Query ::= ( Clause )*
|
||||
|
|
|
@ -100,6 +100,26 @@ public class TestAnalyzers extends TestCase {
|
|||
new String[] { "quoted", "word" });
|
||||
}
|
||||
|
||||
public void testNull() throws Exception {
|
||||
Analyzer a = new NullAnalyzer();
|
||||
assertAnalyzesTo(a, "foo bar FOO BAR",
|
||||
new String[] { "foo", "bar", "FOO", "BAR" });
|
||||
assertAnalyzesTo(a, "foo bar . FOO <> BAR",
|
||||
new String[] { "foo", "bar", ".", "FOO", "<>", "BAR" });
|
||||
assertAnalyzesTo(a, "foo.bar.FOO.BAR",
|
||||
new String[] { "foo.bar.FOO.BAR" });
|
||||
assertAnalyzesTo(a, "U.S.A.",
|
||||
new String[] { "U.S.A." });
|
||||
assertAnalyzesTo(a, "C++",
|
||||
new String[] { "C++" });
|
||||
assertAnalyzesTo(a, "B2B",
|
||||
new String[] { "B2B" });
|
||||
assertAnalyzesTo(a, "2B",
|
||||
new String[] { "2B" });
|
||||
assertAnalyzesTo(a, "\"QUOTED\" word",
|
||||
new String[] { "\"QUOTED\"", "word" });
|
||||
}
|
||||
|
||||
public void testStop() throws Exception {
|
||||
Analyzer a = new StopAnalyzer();
|
||||
assertAnalyzesTo(a, "foo bar FOO BAR",
|
||||
|
|
|
@ -61,6 +61,7 @@ import org.apache.lucene.*;
|
|||
import org.apache.lucene.queryParser.*;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.analysis.standard.*;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
|
||||
public class TestQueryParser extends TestCase {
|
||||
|
@ -135,8 +136,6 @@ public class TestQueryParser extends TestCase {
|
|||
assertQueryEquals("term term term", null, "term term term");
|
||||
assertQueryEquals("türm term term", null, "türm term term");
|
||||
assertQueryEquals("ümlaut", null, "ümlaut");
|
||||
assertQueryEquals("term term1 term2", null, "term term term");
|
||||
assertQueryEquals("term 1.0 1 2", null, "term");
|
||||
|
||||
assertQueryEquals("a AND b", null, "+a +b");
|
||||
assertQueryEquals("(a AND b)", null, "+a +b");
|
||||
|
@ -145,7 +144,6 @@ public class TestQueryParser extends TestCase {
|
|||
assertQueryEquals("a AND -b", null, "+a -b");
|
||||
assertQueryEquals("a AND !b", null, "+a -b");
|
||||
assertQueryEquals("a && b", null, "+a +b");
|
||||
assertQueryEquals("a&&b", null, "+a +b");
|
||||
assertQueryEquals("a && ! b", null, "+a -b");
|
||||
|
||||
assertQueryEquals("a OR b", null, "a b");
|
||||
|
@ -179,6 +177,25 @@ public class TestQueryParser extends TestCase {
|
|||
"+(title:dog title:cat) -author:\"bob dole\"");
|
||||
}
|
||||
|
||||
public void testPunct() throws Exception {
|
||||
Analyzer a = new NullAnalyzer();
|
||||
assertQueryEquals("a&b", a, "a&b");
|
||||
assertQueryEquals("a&&b", a, "a&&b");
|
||||
assertQueryEquals(".NET", a, ".NET");
|
||||
}
|
||||
|
||||
public void testNumber() throws Exception {
|
||||
// The numbers go away because SimpleAnalzyer ignores them
|
||||
assertQueryEquals("3", null, "");
|
||||
assertQueryEquals("term 1.0 1 2", null, "term");
|
||||
assertQueryEquals("term term1 term2", null, "term term term");
|
||||
|
||||
Analyzer a = new StandardAnalyzer();
|
||||
assertQueryEquals("3", a, "3");
|
||||
assertQueryEquals("term 1.0 1 2", a, "term 1.0 1 2");
|
||||
assertQueryEquals("term term1 term2", a, "term term1 term2");
|
||||
}
|
||||
|
||||
public void testWildcard() throws Exception {
|
||||
assertQueryEquals("term*", null, "term*");
|
||||
assertQueryEquals("term*^2", null, "term*^2.0");
|
||||
|
|
Loading…
Reference in New Issue