Fix query parser (finally) to be much more lenient about queries that have funny characters; added new test cases to test new rules; added NullTokenizer/NullAnalyzer which just pass through space-separated tokens unmodified (mostly for testing purposes

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@149641 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Brian Goetz 2002-01-17 02:49:22 +00:00
parent ae45d392f8
commit 1fa4fa82d1
5 changed files with 232 additions and 9 deletions

View File

@ -0,0 +1,65 @@
package org.apache.lucene.analysis;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.Reader;
/** An Analyzer that filters LetterTokenizer with LowerCaseFilter. */
public final class NullAnalyzer extends Analyzer {
public final TokenStream tokenStream(String fieldName, Reader reader) {
return new NullTokenizer(reader);
}
}

View File

@ -0,0 +1,117 @@
package org.apache.lucene.analysis;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.Reader;
/** LowerCaseTokenizer performs the function of LetterTokenizer
and LowerCaseFilter together. It divides text at non-letters and converts
them to lower case. While it is functionally equivalent to the combination
of LetterTokenizer and LowerCaseFilter, there is a performance advantage
to doing the two tasks at once, hence this (redundent) implementation.
Note: this does a decent job for most European languages, but does a terrible
job for some Asian languages, where words are not separated by spaces. */
public final class NullTokenizer extends Tokenizer {
public NullTokenizer(Reader in) {
input = in;
}
private int offset = 0, bufferIndex=0, dataLen=0;
private final static int MAX_WORD_LEN = 255;
private final static int IO_BUFFER_SIZE = 1024;
private final char[] buffer = new char[MAX_WORD_LEN];
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
public final Token next() throws java.io.IOException {
int length = 0;
int start = offset;
while (true) {
final char c;
offset++;
if (bufferIndex >= dataLen) {
dataLen = input.read(ioBuffer);
bufferIndex = 0;
};
if (dataLen == -1) {
if (length > 0)
break;
else
return null;
}
else
c = (char) ioBuffer[bufferIndex++];
if (Character.isWhitespace(c)) {
if (length > 0)
break;
else
continue;
}
if (length == 0) // start of token
start = offset-1;
buffer[length++] = c;
// buffer it
if (length == MAX_WORD_LEN) // buffer overflow!
break;
}
return new Token(new String(buffer, 0, length), start, start+length);
}
}

View File

@ -261,11 +261,16 @@ PARSER_END(QueryParser)
<*> TOKEN : {
<#_NUM_CHAR: ["0"-"9"] >
| <#_TERM_START_CHAR: [ "a"-"z", "A"-"Z", "_", "\u0080"-"\uFFFE" ] >
| <#_TERM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9", "_", ".", "\u0080"-"\uFFFE" ] >
| <#_TERM_START_CHAR: ~[ " ", "\t", "+", "-", "!", "(", ")", ":", "^",
"[", "]", "\"", "{", "}", "~", "*" ] >
| <#_TERM_CHAR: <_TERM_START_CHAR> >
| <#_WHITESPACE: ( " " | "\t" ) >
}
<DEFAULT> SKIP : {
<<_WHITESPACE>>
}
<DEFAULT> TOKEN : {
<AND: ("AND" | "&&") >
| <OR: ("OR" | "||") >
@ -275,9 +280,8 @@ PARSER_END(QueryParser)
| <LPAREN: "(" >
| <RPAREN: ")" >
| <COLON: ":" >
| <CARAT: "^" >
| <CARAT: "^" > : Boost
| <QUOTED: "\"" (~["\""])+ "\"">
| <NUMBER: (<_NUM_CHAR>)+ ( "." (<_NUM_CHAR>)+ )? >
| <TERM: <_TERM_START_CHAR> (<_TERM_CHAR>)* >
| <FUZZY: "~" >
| <PREFIXTERM: <_TERM_START_CHAR> (<_TERM_CHAR>)* "*" >
@ -287,8 +291,8 @@ PARSER_END(QueryParser)
| <RANGEEX: "{" ( ~[ "}" ] )+ "}">
}
<DEFAULT> SKIP : {
<<_WHITESPACE>>
<Boost> TOKEN : {
<NUMBER: (<_NUM_CHAR>)+ ( "." (<_NUM_CHAR>)+ )? > : DEFAULT
}
// * Query ::= ( Clause )*

View File

@ -100,6 +100,26 @@ public class TestAnalyzers extends TestCase {
new String[] { "quoted", "word" });
}
public void testNull() throws Exception {
Analyzer a = new NullAnalyzer();
assertAnalyzesTo(a, "foo bar FOO BAR",
new String[] { "foo", "bar", "FOO", "BAR" });
assertAnalyzesTo(a, "foo bar . FOO <> BAR",
new String[] { "foo", "bar", ".", "FOO", "<>", "BAR" });
assertAnalyzesTo(a, "foo.bar.FOO.BAR",
new String[] { "foo.bar.FOO.BAR" });
assertAnalyzesTo(a, "U.S.A.",
new String[] { "U.S.A." });
assertAnalyzesTo(a, "C++",
new String[] { "C++" });
assertAnalyzesTo(a, "B2B",
new String[] { "B2B" });
assertAnalyzesTo(a, "2B",
new String[] { "2B" });
assertAnalyzesTo(a, "\"QUOTED\" word",
new String[] { "\"QUOTED\"", "word" });
}
public void testStop() throws Exception {
Analyzer a = new StopAnalyzer();
assertAnalyzesTo(a, "foo bar FOO BAR",

View File

@ -61,6 +61,7 @@ import org.apache.lucene.*;
import org.apache.lucene.queryParser.*;
import org.apache.lucene.search.*;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.analysis.Token;
public class TestQueryParser extends TestCase {
@ -135,8 +136,6 @@ public class TestQueryParser extends TestCase {
assertQueryEquals("term term term", null, "term term term");
assertQueryEquals("türm term term", null, "türm term term");
assertQueryEquals("ümlaut", null, "ümlaut");
assertQueryEquals("term term1 term2", null, "term term term");
assertQueryEquals("term 1.0 1 2", null, "term");
assertQueryEquals("a AND b", null, "+a +b");
assertQueryEquals("(a AND b)", null, "+a +b");
@ -145,7 +144,6 @@ public class TestQueryParser extends TestCase {
assertQueryEquals("a AND -b", null, "+a -b");
assertQueryEquals("a AND !b", null, "+a -b");
assertQueryEquals("a && b", null, "+a +b");
assertQueryEquals("a&&b", null, "+a +b");
assertQueryEquals("a && ! b", null, "+a -b");
assertQueryEquals("a OR b", null, "a b");
@ -179,6 +177,25 @@ public class TestQueryParser extends TestCase {
"+(title:dog title:cat) -author:\"bob dole\"");
}
public void testPunct() throws Exception {
Analyzer a = new NullAnalyzer();
assertQueryEquals("a&b", a, "a&b");
assertQueryEquals("a&&b", a, "a&&b");
assertQueryEquals(".NET", a, ".NET");
}
public void testNumber() throws Exception {
// The numbers go away because SimpleAnalzyer ignores them
assertQueryEquals("3", null, "");
assertQueryEquals("term 1.0 1 2", null, "term");
assertQueryEquals("term term1 term2", null, "term term term");
Analyzer a = new StandardAnalyzer();
assertQueryEquals("3", a, "3");
assertQueryEquals("term 1.0 1 2", a, "term 1.0 1 2");
assertQueryEquals("term term1 term2", a, "term term1 term2");
}
public void testWildcard() throws Exception {
assertQueryEquals("term*", null, "term*");
assertQueryEquals("term*^2", null, "term*^2.0");