From 1fa4fa82d125974f235cb43e0cad886982850367 Mon Sep 17 00:00:00 2001 From: Brian Goetz Date: Thu, 17 Jan 2002 02:49:22 +0000 Subject: [PATCH] Fix query parser (finally) to be much more lenient about queries that have funny characters; added new test cases to test new rules; added NullTokenizer/NullAnalyzer which just pass through space-separated tokens unmodified (mostly for testing purposes git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@149641 13f79535-47bb-0310-9956-ffa450edef68 --- .../apache/lucene/analysis/NullAnalyzer.java | 65 ++++++++++ .../apache/lucene/analysis/NullTokenizer.java | 117 ++++++++++++++++++ .../apache/lucene/queryParser/QueryParser.jj | 16 ++- .../apache/lucene/analysis/TestAnalyzers.java | 20 +++ .../lucene/queryParser/TestQueryParser.java | 23 +++- 5 files changed, 232 insertions(+), 9 deletions(-) create mode 100644 src/java/org/apache/lucene/analysis/NullAnalyzer.java create mode 100644 src/java/org/apache/lucene/analysis/NullTokenizer.java diff --git a/src/java/org/apache/lucene/analysis/NullAnalyzer.java b/src/java/org/apache/lucene/analysis/NullAnalyzer.java new file mode 100644 index 00000000000..3bf285e3fc1 --- /dev/null +++ b/src/java/org/apache/lucene/analysis/NullAnalyzer.java @@ -0,0 +1,65 @@ +package org.apache.lucene.analysis; + +/* ==================================================================== + * The Apache Software License, Version 1.1 + * + * Copyright (c) 2001 The Apache Software Foundation. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. + * + * 4. The names "Apache" and "Apache Software Foundation" and + * "Apache Lucene" must not be used to endorse or promote products + * derived from this software without prior written permission. For + * written permission, please contact apache@apache.org. + * + * 5. Products derived from this software may not be called "Apache", + * "Apache Lucene", nor may "Apache" appear in their name, without + * prior written permission of the Apache Software Foundation. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . + */ + +import java.io.Reader; + +/** An Analyzer that filters LetterTokenizer with LowerCaseFilter. */ + +public final class NullAnalyzer extends Analyzer { + public final TokenStream tokenStream(String fieldName, Reader reader) { + return new NullTokenizer(reader); + } +} diff --git a/src/java/org/apache/lucene/analysis/NullTokenizer.java b/src/java/org/apache/lucene/analysis/NullTokenizer.java new file mode 100644 index 00000000000..1795891a477 --- /dev/null +++ b/src/java/org/apache/lucene/analysis/NullTokenizer.java @@ -0,0 +1,117 @@ +package org.apache.lucene.analysis; + +/* ==================================================================== + * The Apache Software License, Version 1.1 + * + * Copyright (c) 2001 The Apache Software Foundation. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. + * + * 4. The names "Apache" and "Apache Software Foundation" and + * "Apache Lucene" must not be used to endorse or promote products + * derived from this software without prior written permission. For + * written permission, please contact apache@apache.org. + * + * 5. Products derived from this software may not be called "Apache", + * "Apache Lucene", nor may "Apache" appear in their name, without + * prior written permission of the Apache Software Foundation. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . + */ + +import java.io.Reader; + +/** LowerCaseTokenizer performs the function of LetterTokenizer + and LowerCaseFilter together. It divides text at non-letters and converts + them to lower case. While it is functionally equivalent to the combination + of LetterTokenizer and LowerCaseFilter, there is a performance advantage + to doing the two tasks at once, hence this (redundent) implementation. + + Note: this does a decent job for most European languages, but does a terrible + job for some Asian languages, where words are not separated by spaces. */ + +public final class NullTokenizer extends Tokenizer { + public NullTokenizer(Reader in) { + input = in; + } + + private int offset = 0, bufferIndex=0, dataLen=0; + private final static int MAX_WORD_LEN = 255; + private final static int IO_BUFFER_SIZE = 1024; + private final char[] buffer = new char[MAX_WORD_LEN]; + private final char[] ioBuffer = new char[IO_BUFFER_SIZE]; + + public final Token next() throws java.io.IOException { + int length = 0; + int start = offset; + while (true) { + final char c; + + offset++; + if (bufferIndex >= dataLen) { + dataLen = input.read(ioBuffer); + bufferIndex = 0; + }; + if (dataLen == -1) { + if (length > 0) + break; + else + return null; + } + else + c = (char) ioBuffer[bufferIndex++]; + + if (Character.isWhitespace(c)) { + if (length > 0) + break; + else + continue; + } + + if (length == 0) // start of token + start = offset-1; + + buffer[length++] = c; + // buffer it + if (length == MAX_WORD_LEN) // buffer overflow! + break; + } + + return new Token(new String(buffer, 0, length), start, start+length); + } +} diff --git a/src/java/org/apache/lucene/queryParser/QueryParser.jj b/src/java/org/apache/lucene/queryParser/QueryParser.jj index 3f3a140f25a..cd2b6d1c286 100644 --- a/src/java/org/apache/lucene/queryParser/QueryParser.jj +++ b/src/java/org/apache/lucene/queryParser/QueryParser.jj @@ -261,11 +261,16 @@ PARSER_END(QueryParser) <*> TOKEN : { <#_NUM_CHAR: ["0"-"9"] > -| <#_TERM_START_CHAR: [ "a"-"z", "A"-"Z", "_", "\u0080"-"\uFFFE" ] > -| <#_TERM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9", "_", ".", "\u0080"-"\uFFFE" ] > +| <#_TERM_START_CHAR: ~[ " ", "\t", "+", "-", "!", "(", ")", ":", "^", + "[", "]", "\"", "{", "}", "~", "*" ] > +| <#_TERM_CHAR: <_TERM_START_CHAR> > | <#_WHITESPACE: ( " " | "\t" ) > } + SKIP : { + <<_WHITESPACE>> +} + TOKEN : { | @@ -275,9 +280,8 @@ PARSER_END(QueryParser) | | | -| +| : Boost | -| )+ ( "." (<_NUM_CHAR>)+ )? > | (<_TERM_CHAR>)* > | | (<_TERM_CHAR>)* "*" > @@ -287,8 +291,8 @@ PARSER_END(QueryParser) | } - SKIP : { - <<_WHITESPACE>> + TOKEN : { +)+ ( "." (<_NUM_CHAR>)+ )? > : DEFAULT } // * Query ::= ( Clause )* diff --git a/src/test/org/apache/lucene/analysis/TestAnalyzers.java b/src/test/org/apache/lucene/analysis/TestAnalyzers.java index 39ae9763226..19ce96efa0b 100644 --- a/src/test/org/apache/lucene/analysis/TestAnalyzers.java +++ b/src/test/org/apache/lucene/analysis/TestAnalyzers.java @@ -100,6 +100,26 @@ public class TestAnalyzers extends TestCase { new String[] { "quoted", "word" }); } + public void testNull() throws Exception { + Analyzer a = new NullAnalyzer(); + assertAnalyzesTo(a, "foo bar FOO BAR", + new String[] { "foo", "bar", "FOO", "BAR" }); + assertAnalyzesTo(a, "foo bar . FOO <> BAR", + new String[] { "foo", "bar", ".", "FOO", "<>", "BAR" }); + assertAnalyzesTo(a, "foo.bar.FOO.BAR", + new String[] { "foo.bar.FOO.BAR" }); + assertAnalyzesTo(a, "U.S.A.", + new String[] { "U.S.A." }); + assertAnalyzesTo(a, "C++", + new String[] { "C++" }); + assertAnalyzesTo(a, "B2B", + new String[] { "B2B" }); + assertAnalyzesTo(a, "2B", + new String[] { "2B" }); + assertAnalyzesTo(a, "\"QUOTED\" word", + new String[] { "\"QUOTED\"", "word" }); + } + public void testStop() throws Exception { Analyzer a = new StopAnalyzer(); assertAnalyzesTo(a, "foo bar FOO BAR", diff --git a/src/test/org/apache/lucene/queryParser/TestQueryParser.java b/src/test/org/apache/lucene/queryParser/TestQueryParser.java index 1513bd3c76c..19e158ff1b5 100644 --- a/src/test/org/apache/lucene/queryParser/TestQueryParser.java +++ b/src/test/org/apache/lucene/queryParser/TestQueryParser.java @@ -61,6 +61,7 @@ import org.apache.lucene.*; import org.apache.lucene.queryParser.*; import org.apache.lucene.search.*; import org.apache.lucene.analysis.*; +import org.apache.lucene.analysis.standard.*; import org.apache.lucene.analysis.Token; public class TestQueryParser extends TestCase { @@ -135,8 +136,6 @@ public class TestQueryParser extends TestCase { assertQueryEquals("term term term", null, "term term term"); assertQueryEquals("türm term term", null, "türm term term"); assertQueryEquals("ümlaut", null, "ümlaut"); - assertQueryEquals("term term1 term2", null, "term term term"); - assertQueryEquals("term 1.0 1 2", null, "term"); assertQueryEquals("a AND b", null, "+a +b"); assertQueryEquals("(a AND b)", null, "+a +b"); @@ -145,7 +144,6 @@ public class TestQueryParser extends TestCase { assertQueryEquals("a AND -b", null, "+a -b"); assertQueryEquals("a AND !b", null, "+a -b"); assertQueryEquals("a && b", null, "+a +b"); - assertQueryEquals("a&&b", null, "+a +b"); assertQueryEquals("a && ! b", null, "+a -b"); assertQueryEquals("a OR b", null, "a b"); @@ -179,6 +177,25 @@ public class TestQueryParser extends TestCase { "+(title:dog title:cat) -author:\"bob dole\""); } + public void testPunct() throws Exception { + Analyzer a = new NullAnalyzer(); + assertQueryEquals("a&b", a, "a&b"); + assertQueryEquals("a&&b", a, "a&&b"); + assertQueryEquals(".NET", a, ".NET"); + } + + public void testNumber() throws Exception { + // The numbers go away because SimpleAnalzyer ignores them + assertQueryEquals("3", null, ""); + assertQueryEquals("term 1.0 1 2", null, "term"); + assertQueryEquals("term term1 term2", null, "term term term"); + + Analyzer a = new StandardAnalyzer(); + assertQueryEquals("3", a, "3"); + assertQueryEquals("term 1.0 1 2", a, "term 1.0 1 2"); + assertQueryEquals("term term1 term2", a, "term term1 term2"); + } + public void testWildcard() throws Exception { assertQueryEquals("term*", null, "term*"); assertQueryEquals("term*^2", null, "term*^2.0");