Fix query parser (finally) to be much more lenient about queries that have funny characters; added new test cases to test new rules; added NullTokenizer/NullAnalyzer which just pass through space-separated tokens unmodified (mostly for testing purposes)

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@149641 13f79535-47bb-0310-9956-ffa450edef68
2002-01-17 02:49:22 +00:00 · 2002-01-17 02:49:22 +00:00 · 1fa4fa82d1
parent ae45d392f8
commit 1fa4fa82d1
5 changed files with 232 additions and 9 deletions
--- a/src/java/org/apache/lucene/analysis/NullAnalyzer.java
+++ b/src/java/org/apache/lucene/analysis/NullAnalyzer.java
@ -0,0 +1,65 @@
+package org.apache.lucene.analysis;
+
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation.  All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ *    if any, must include the following acknowledgment:
+ *       "This product includes software developed by the
+ *        Apache Software Foundation (http://www.apache.org/)."
+ *    Alternately, this acknowledgment may appear in the software itself,
+ *    if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ *    "Apache Lucene" must not be used to endorse or promote products
+ *    derived from this software without prior written permission. For
+ *    written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
+ *    prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation.  For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+import java.io.Reader;
+
+/** An Analyzer that tokenizes every field with NullTokenizer; fieldName is ignored. */
+
+public final class NullAnalyzer extends Analyzer {
+  public final TokenStream tokenStream(String fieldName, Reader reader) {
+    return new NullTokenizer(reader);
+  }
+}
--- a/src/java/org/apache/lucene/analysis/NullTokenizer.java
+++ b/src/java/org/apache/lucene/analysis/NullTokenizer.java
@ -0,0 +1,117 @@
+package org.apache.lucene.analysis;
+
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation.  All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ *    if any, must include the following acknowledgment:
+ *       "This product includes software developed by the
+ *        Apache Software Foundation (http://www.apache.org/)."
+ *    Alternately, this acknowledgment may appear in the software itself,
+ *    if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ *    "Apache Lucene" must not be used to endorse or promote products
+ *    derived from this software without prior written permission. For
+ *    written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
+ *    prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation.  For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+import java.io.Reader;
+
+/** NullTokenizer divides text at whitespace (Character.isWhitespace) and
+  returns each run of non-whitespace characters as a token, unmodified: no
+  case folding and no stripping of punctuation or digits.  Each Token is
+  created with the run's start position and start+length, counted in
+  characters read from the input.
+
+  Note: a run longer than MAX_WORD_LEN characters is cut into several tokens,
+  and text whose words are not separated by whitespace is not split at all. */
+
+public final class NullTokenizer extends Tokenizer {
+  public NullTokenizer(Reader in) {
+    input = in;
+  }
+
+  private int offset = 0, bufferIndex=0, dataLen=0;  // chars consumed so far; read position in and fill level of ioBuffer
+  private final static int MAX_WORD_LEN = 255;
+  private final static int IO_BUFFER_SIZE = 1024;
+  private final char[] buffer = new char[MAX_WORD_LEN];  // characters of the token being built
+  private final char[] ioBuffer = new char[IO_BUFFER_SIZE];  // chunk most recently read from input
+
+  public final Token next() throws java.io.IOException {
+    int length = 0;
+    int start = offset;
+    while (true) {
+      final char c;
+
+      offset++;
+      if (bufferIndex >= dataLen) {  // ioBuffer exhausted: refill it
+        dataLen = input.read(ioBuffer);
+        bufferIndex = 0;
+      };
+      if (dataLen == -1) {  // end of input
+	if (length > 0)
+	  break;  // emit the token collected so far
+	else
+	  return null;  // no more tokens
+      }
+      else
+        c = (char) ioBuffer[bufferIndex++];
+      
+      if (Character.isWhitespace(c)) {  // whitespace ends a token, or is skipped before one
+        if (length > 0)
+          break;
+        else
+          continue;
+      }
+
+      if (length == 0)			  // start of token
+        start = offset-1;
+
+      buffer[length++] = c;
+                                                  // buffer it
+      if (length == MAX_WORD_LEN)		  // buffer full: cut the token here
+        break;
+    }
+
+    return new Token(new String(buffer, 0, length), start, start+length);
+  }
+}
--- a/src/java/org/apache/lucene/queryParser/QueryParser.jj
+++ b/src/java/org/apache/lucene/queryParser/QueryParser.jj
@ -261,11 +261,16 @@ PARSER_END(QueryParser)

 <*> TOKEN : {
  <#_NUM_CHAR:   ["0"-"9"] >
-| <#_TERM_START_CHAR: [ "a"-"z", "A"-"Z", "_", "\u0080"-"\uFFFE" ] >
-| <#_TERM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9", "_", ".", "\u0080"-"\uFFFE" ] >
+| <#_TERM_START_CHAR: ~[ " ", "\t", "+", "-", "!", "(", ")", ":", "^", 
+                         "[", "]", "\"", "{", "}", "~", "*" ] >
+| <#_TERM_CHAR: <_TERM_START_CHAR> >
 | <#_WHITESPACE: ( " " | "\t" ) >
 }

+<DEFAULT> SKIP : {
+  <<_WHITESPACE>>
+}
+
 <DEFAULT> TOKEN : {
  <AND:       ("AND" | "&&") >
 | <OR:        ("OR" | "||") >
@ -275,9 +280,8 @@ PARSER_END(QueryParser)
 | <LPAREN:    "(" >
 | <RPAREN:    ")" >
 | <COLON:     ":" >
-| <CARAT:     "^" >
+| <CARAT:     "^" > : Boost
 | <QUOTED:     "\"" (~["\""])+ "\"">
-| <NUMBER:    (<_NUM_CHAR>)+ ( "." (<_NUM_CHAR>)+ )? >
 | <TERM:      <_TERM_START_CHAR> (<_TERM_CHAR>)*  >
 | <FUZZY:     "~" >
 | <PREFIXTERM:  <_TERM_START_CHAR> (<_TERM_CHAR>)* "*" >
@ -287,8 +291,8 @@ PARSER_END(QueryParser)
 | <RANGEEX:   "{" ( ~[ "}" ] )+ "}">
 }

-<DEFAULT> SKIP : {
-  <<_WHITESPACE>>
+<Boost> TOKEN : {
+<NUMBER:    (<_NUM_CHAR>)+ ( "." (<_NUM_CHAR>)+ )? > : DEFAULT
 }

 // *   Query  ::= ( Clause )*
--- a/src/test/org/apache/lucene/analysis/TestAnalyzers.java
+++ b/src/test/org/apache/lucene/analysis/TestAnalyzers.java
@ -100,6 +100,26 @@ public class TestAnalyzers extends TestCase {
                     new String[] { "quoted", "word" });
  }

+  public void testNull() throws Exception {
+    Analyzer a = new NullAnalyzer();  // expected: split on whitespace only; case, digits and punctuation kept
+    assertAnalyzesTo(a, "foo bar FOO BAR", 
+                     new String[] { "foo", "bar", "FOO", "BAR" });
+    assertAnalyzesTo(a, "foo      bar .  FOO <> BAR", 
+                     new String[] { "foo", "bar", ".", "FOO", "<>", "BAR" });
+    assertAnalyzesTo(a, "foo.bar.FOO.BAR", 
+                     new String[] { "foo.bar.FOO.BAR" });
+    assertAnalyzesTo(a, "U.S.A.", 
+                     new String[] { "U.S.A." });
+    assertAnalyzesTo(a, "C++", 
+                     new String[] { "C++" });
+    assertAnalyzesTo(a, "B2B", 
+                     new String[] { "B2B" });
+    assertAnalyzesTo(a, "2B", 
+                     new String[] { "2B" });
+    assertAnalyzesTo(a, "\"QUOTED\" word", 
+                     new String[] { "\"QUOTED\"", "word" });
+  }
+
  public void testStop() throws Exception {
    Analyzer a = new StopAnalyzer();
    assertAnalyzesTo(a, "foo bar FOO BAR", 
--- a/src/test/org/apache/lucene/queryParser/TestQueryParser.java
+++ b/src/test/org/apache/lucene/queryParser/TestQueryParser.java
@ -61,6 +61,7 @@ import org.apache.lucene.*;
 import org.apache.lucene.queryParser.*;
 import org.apache.lucene.search.*;
 import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.standard.*;
 import org.apache.lucene.analysis.Token;

 public class TestQueryParser extends TestCase {
@ -135,8 +136,6 @@ public class TestQueryParser extends TestCase {
    assertQueryEquals("term term term", null, "term term term");
    assertQueryEquals("türm term term", null, "türm term term");
    assertQueryEquals("ümlaut", null, "ümlaut");
-    assertQueryEquals("term term1 term2", null, "term term term");
-    assertQueryEquals("term 1.0 1 2", null, "term");

    assertQueryEquals("a AND b", null, "+a +b");
    assertQueryEquals("(a AND b)", null, "+a +b");
@ -145,7 +144,6 @@ public class TestQueryParser extends TestCase {
    assertQueryEquals("a AND -b", null, "+a -b");
    assertQueryEquals("a AND !b", null, "+a -b");
    assertQueryEquals("a && b", null, "+a +b");
-    assertQueryEquals("a&&b", null, "+a +b");
    assertQueryEquals("a && ! b", null, "+a -b");

    assertQueryEquals("a OR b", null, "a b");
@ -179,6 +177,25 @@ public class TestQueryParser extends TestCase {
                      "+(title:dog title:cat) -author:\"bob dole\"");
  }

+  public void testPunct() throws Exception {
+    Analyzer a = new NullAnalyzer();  // leaves tokens unmodified, so punctuation survives analysis
+    assertQueryEquals("a&b", a, "a&b");
+    assertQueryEquals("a&&b", a, "a&&b");
+    assertQueryEquals(".NET", a, ".NET");
+  }
+
+  public void testNumber() throws Exception {
+    // The numbers go away because SimpleAnalyzer ignores them
+    assertQueryEquals("3", null, "");
+    assertQueryEquals("term 1.0 1 2", null, "term");
+    assertQueryEquals("term term1 term2", null, "term term term");
+
+    Analyzer a = new StandardAnalyzer();  // with this analyzer the numeric tokens are expected to be kept
+    assertQueryEquals("3", a, "3");
+    assertQueryEquals("term 1.0 1 2", a, "term 1.0 1 2");
+    assertQueryEquals("term term1 term2", a, "term term1 term2");
+  }
+
  public void testWildcard() throws Exception {
    assertQueryEquals("term*", null, "term*");
    assertQueryEquals("term*^2", null, "term*^2.0");