From 1fa4fa82d125974f235cb43e0cad886982850367 Mon Sep 17 00:00:00 2001
From: Brian Goetz <briangoetz@apache.org>
Date: Thu, 17 Jan 2002 02:49:22 +0000
Subject: [PATCH] Fix query parser (finally) to be much more lenient about
 queries that have funny characters; added new test cases to test new rules;
 added NullTokenizer/NullAnalyzer which just pass through space-separated
 tokens unmodified (mostly for testing purposes

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@149641 13f79535-47bb-0310-9956-ffa450edef68
---
 .../apache/lucene/analysis/NullAnalyzer.java  |  65 ++++++++++
 .../apache/lucene/analysis/NullTokenizer.java | 117 ++++++++++++++++++
 .../apache/lucene/queryParser/QueryParser.jj  |  16 ++-
 .../apache/lucene/analysis/TestAnalyzers.java |  20 +++
 .../lucene/queryParser/TestQueryParser.java   |  23 +++-
 5 files changed, 232 insertions(+), 9 deletions(-)
 create mode 100644 src/java/org/apache/lucene/analysis/NullAnalyzer.java
 create mode 100644 src/java/org/apache/lucene/analysis/NullTokenizer.java

diff --git a/src/java/org/apache/lucene/analysis/NullAnalyzer.java b/src/java/org/apache/lucene/analysis/NullAnalyzer.java
new file mode 100644
index 00000000000..3bf285e3fc1
--- /dev/null
+++ b/src/java/org/apache/lucene/analysis/NullAnalyzer.java
@@ -0,0 +1,65 @@
+package org.apache.lucene.analysis;
+
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation.  All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ *    if any, must include the following acknowledgment:
+ *       "This product includes software developed by the
+ *        Apache Software Foundation (http://www.apache.org/)."
+ *    Alternately, this acknowledgment may appear in the software itself,
+ *    if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ *    "Apache Lucene" must not be used to endorse or promote products
+ *    derived from this software without prior written permission. For
+ *    written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
+ *    prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation.  For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+import java.io.Reader;
+
+/** An Analyzer that filters LetterTokenizer with LowerCaseFilter. */
+
+public final class NullAnalyzer extends Analyzer {
+  public final TokenStream tokenStream(String fieldName, Reader reader) {
+    return new NullTokenizer(reader);
+  }
+}
diff --git a/src/java/org/apache/lucene/analysis/NullTokenizer.java b/src/java/org/apache/lucene/analysis/NullTokenizer.java
new file mode 100644
index 00000000000..1795891a477
--- /dev/null
+++ b/src/java/org/apache/lucene/analysis/NullTokenizer.java
@@ -0,0 +1,117 @@
+package org.apache.lucene.analysis;
+
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation.  All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ *    if any, must include the following acknowledgment:
+ *       "This product includes software developed by the
+ *        Apache Software Foundation (http://www.apache.org/)."
+ *    Alternately, this acknowledgment may appear in the software itself,
+ *    if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ *    "Apache Lucene" must not be used to endorse or promote products
+ *    derived from this software without prior written permission. For
+ *    written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
+ *    prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation.  For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+import java.io.Reader;
+
+/** LowerCaseTokenizer performs the function of LetterTokenizer
+  and LowerCaseFilter together.  It divides text at non-letters and converts
+  them to lower case.  While it is functionally equivalent to the combination
+  of LetterTokenizer and LowerCaseFilter, there is a performance advantage
+  to doing the two tasks at once, hence this (redundent) implementation.
+
+  Note: this does a decent job for most European languages, but does a terrible
+  job for some Asian languages, where words are not separated by spaces. */
+
+public final class NullTokenizer extends Tokenizer {
+  public NullTokenizer(Reader in) {
+    input = in;
+  }
+
+  private int offset = 0, bufferIndex=0, dataLen=0;
+  private final static int MAX_WORD_LEN = 255;
+  private final static int IO_BUFFER_SIZE = 1024;
+  private final char[] buffer = new char[MAX_WORD_LEN];
+  private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
+
+  public final Token next() throws java.io.IOException {
+    int length = 0;
+    int start = offset;
+    while (true) {
+      final char c;
+
+      offset++;
+      if (bufferIndex >= dataLen) {
+        dataLen = input.read(ioBuffer);
+        bufferIndex = 0;
+      };
+      if (dataLen == -1) {
+	if (length > 0)
+	  break;
+	else
+	  return null;
+      }
+      else
+        c = (char) ioBuffer[bufferIndex++];
+      
+      if (Character.isWhitespace(c)) {
+        if (length > 0)
+          break;
+        else
+          continue;
+      }
+
+      if (length == 0)			  // start of token
+        start = offset-1;
+
+      buffer[length++] = c;
+                                                  // buffer it
+      if (length == MAX_WORD_LEN)		  // buffer overflow!
+        break;
+    }
+
+    return new Token(new String(buffer, 0, length), start, start+length);
+  }
+}
diff --git a/src/java/org/apache/lucene/queryParser/QueryParser.jj b/src/java/org/apache/lucene/queryParser/QueryParser.jj
index 3f3a140f25a..cd2b6d1c286 100644
--- a/src/java/org/apache/lucene/queryParser/QueryParser.jj
+++ b/src/java/org/apache/lucene/queryParser/QueryParser.jj
@@ -261,11 +261,16 @@ PARSER_END(QueryParser)
 
 <*> TOKEN : {
   <#_NUM_CHAR:   ["0"-"9"] >
-| <#_TERM_START_CHAR: [ "a"-"z", "A"-"Z", "_", "\u0080"-"\uFFFE" ] >
-| <#_TERM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9", "_", ".", "\u0080"-"\uFFFE" ] >
+| <#_TERM_START_CHAR: ~[ " ", "\t", "+", "-", "!", "(", ")", ":", "^", 
+                         "[", "]", "\"", "{", "}", "~", "*" ] >
+| <#_TERM_CHAR: <_TERM_START_CHAR> >
 | <#_WHITESPACE: ( " " | "\t" ) >
 }
 
+<DEFAULT> SKIP : {
+  <<_WHITESPACE>>
+}
+
 <DEFAULT> TOKEN : {
   <AND:       ("AND" | "&&") >
 | <OR:        ("OR" | "||") >
@@ -275,9 +280,8 @@ PARSER_END(QueryParser)
 | <LPAREN:    "(" >
 | <RPAREN:    ")" >
 | <COLON:     ":" >
-| <CARAT:     "^" >
+| <CARAT:     "^" > : Boost
 | <QUOTED:     "\"" (~["\""])+ "\"">
-| <NUMBER:    (<_NUM_CHAR>)+ ( "." (<_NUM_CHAR>)+ )? >
 | <TERM:      <_TERM_START_CHAR> (<_TERM_CHAR>)*  >
 | <FUZZY:     "~" >
 | <PREFIXTERM:  <_TERM_START_CHAR> (<_TERM_CHAR>)* "*" >
@@ -287,8 +291,8 @@ PARSER_END(QueryParser)
 | <RANGEEX:   "{" ( ~[ "}" ] )+ "}">
 }
 
-<DEFAULT> SKIP : {
-  <<_WHITESPACE>>
+<Boost> TOKEN : {
+<NUMBER:    (<_NUM_CHAR>)+ ( "." (<_NUM_CHAR>)+ )? > : DEFAULT
 }
 
 // *   Query  ::= ( Clause )*
diff --git a/src/test/org/apache/lucene/analysis/TestAnalyzers.java b/src/test/org/apache/lucene/analysis/TestAnalyzers.java
index 39ae9763226..19ce96efa0b 100644
--- a/src/test/org/apache/lucene/analysis/TestAnalyzers.java
+++ b/src/test/org/apache/lucene/analysis/TestAnalyzers.java
@@ -100,6 +100,26 @@ public class TestAnalyzers extends TestCase {
                      new String[] { "quoted", "word" });
   }
 
+  public void testNull() throws Exception {
+    Analyzer a = new NullAnalyzer();
+    assertAnalyzesTo(a, "foo bar FOO BAR", 
+                     new String[] { "foo", "bar", "FOO", "BAR" });
+    assertAnalyzesTo(a, "foo      bar .  FOO <> BAR", 
+                     new String[] { "foo", "bar", ".", "FOO", "<>", "BAR" });
+    assertAnalyzesTo(a, "foo.bar.FOO.BAR", 
+                     new String[] { "foo.bar.FOO.BAR" });
+    assertAnalyzesTo(a, "U.S.A.", 
+                     new String[] { "U.S.A." });
+    assertAnalyzesTo(a, "C++", 
+                     new String[] { "C++" });
+    assertAnalyzesTo(a, "B2B", 
+                     new String[] { "B2B" });
+    assertAnalyzesTo(a, "2B", 
+                     new String[] { "2B" });
+    assertAnalyzesTo(a, "\"QUOTED\" word", 
+                     new String[] { "\"QUOTED\"", "word" });
+  }
+
   public void testStop() throws Exception {
     Analyzer a = new StopAnalyzer();
     assertAnalyzesTo(a, "foo bar FOO BAR", 
diff --git a/src/test/org/apache/lucene/queryParser/TestQueryParser.java b/src/test/org/apache/lucene/queryParser/TestQueryParser.java
index 1513bd3c76c..19e158ff1b5 100644
--- a/src/test/org/apache/lucene/queryParser/TestQueryParser.java
+++ b/src/test/org/apache/lucene/queryParser/TestQueryParser.java
@@ -61,6 +61,7 @@ import org.apache.lucene.*;
 import org.apache.lucene.queryParser.*;
 import org.apache.lucene.search.*;
 import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.standard.*;
 import org.apache.lucene.analysis.Token;
 
 public class TestQueryParser extends TestCase {
@@ -135,8 +136,6 @@ public class TestQueryParser extends TestCase {
     assertQueryEquals("term term term", null, "term term term");
     assertQueryEquals("türm term term", null, "türm term term");
     assertQueryEquals("ümlaut", null, "ümlaut");
-    assertQueryEquals("term term1 term2", null, "term term term");
-    assertQueryEquals("term 1.0 1 2", null, "term");
 
     assertQueryEquals("a AND b", null, "+a +b");
     assertQueryEquals("(a AND b)", null, "+a +b");
@@ -145,7 +144,6 @@ public class TestQueryParser extends TestCase {
     assertQueryEquals("a AND -b", null, "+a -b");
     assertQueryEquals("a AND !b", null, "+a -b");
     assertQueryEquals("a && b", null, "+a +b");
-    assertQueryEquals("a&&b", null, "+a +b");
     assertQueryEquals("a && ! b", null, "+a -b");
 
     assertQueryEquals("a OR b", null, "a b");
@@ -179,6 +177,25 @@ public class TestQueryParser extends TestCase {
                       "+(title:dog title:cat) -author:\"bob dole\"");
   }
 
+  public void testPunct() throws Exception {
+    Analyzer a = new NullAnalyzer();
+    assertQueryEquals("a&b", a, "a&b");
+    assertQueryEquals("a&&b", a, "a&&b");
+    assertQueryEquals(".NET", a, ".NET");
+  }
+
+  public void testNumber() throws Exception {
+    // The numbers go away because SimpleAnalzyer ignores them
+    assertQueryEquals("3", null, "");
+    assertQueryEquals("term 1.0 1 2", null, "term");
+    assertQueryEquals("term term1 term2", null, "term term term");
+
+    Analyzer a = new StandardAnalyzer();
+    assertQueryEquals("3", a, "3");
+    assertQueryEquals("term 1.0 1 2", a, "term 1.0 1 2");
+    assertQueryEquals("term term1 term2", a, "term term1 term2");
+  }
+
   public void testWildcard() throws Exception {
     assertQueryEquals("term*", null, "term*");
     assertQueryEquals("term*^2", null, "term*^2.0");