diff --git a/src/java/org/apache/lucene/analysis/NullAnalyzer.java b/src/java/org/apache/lucene/analysis/NullAnalyzer.java
new file mode 100644
index 00000000000..3bf285e3fc1
--- /dev/null
+++ b/src/java/org/apache/lucene/analysis/NullAnalyzer.java
@@ -0,0 +1,65 @@
+package org.apache.lucene.analysis;
+
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache Lucene" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Lucene", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * .
+ */
+
+import java.io.Reader;
+
+/** An Analyzer that filters LetterTokenizer with LowerCaseFilter. */
+
+public final class NullAnalyzer extends Analyzer {
+ public final TokenStream tokenStream(String fieldName, Reader reader) {
+ return new NullTokenizer(reader);
+ }
+}
diff --git a/src/java/org/apache/lucene/analysis/NullTokenizer.java b/src/java/org/apache/lucene/analysis/NullTokenizer.java
new file mode 100644
index 00000000000..1795891a477
--- /dev/null
+++ b/src/java/org/apache/lucene/analysis/NullTokenizer.java
@@ -0,0 +1,117 @@
+package org.apache.lucene.analysis;
+
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ * if any, must include the following acknowledgment:
+ * "This product includes software developed by the
+ * Apache Software Foundation (http://www.apache.org/)."
+ * Alternately, this acknowledgment may appear in the software itself,
+ * if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ * "Apache Lucene" must not be used to endorse or promote products
+ * derived from this software without prior written permission. For
+ * written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ * "Apache Lucene", nor may "Apache" appear in their name, without
+ * prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation. For more
+ * information on the Apache Software Foundation, please see
+ * .
+ */
+
+import java.io.Reader;
+
+/** LowerCaseTokenizer performs the function of LetterTokenizer
+ and LowerCaseFilter together. It divides text at non-letters and converts
+ them to lower case. While it is functionally equivalent to the combination
+ of LetterTokenizer and LowerCaseFilter, there is a performance advantage
+ to doing the two tasks at once, hence this (redundent) implementation.
+
+ Note: this does a decent job for most European languages, but does a terrible
+ job for some Asian languages, where words are not separated by spaces. */
+
+public final class NullTokenizer extends Tokenizer {
+ public NullTokenizer(Reader in) {
+ input = in;
+ }
+
+ private int offset = 0, bufferIndex=0, dataLen=0;
+ private final static int MAX_WORD_LEN = 255;
+ private final static int IO_BUFFER_SIZE = 1024;
+ private final char[] buffer = new char[MAX_WORD_LEN];
+ private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
+
+ public final Token next() throws java.io.IOException {
+ int length = 0;
+ int start = offset;
+ while (true) {
+ final char c;
+
+ offset++;
+ if (bufferIndex >= dataLen) {
+ dataLen = input.read(ioBuffer);
+ bufferIndex = 0;
+ };
+ if (dataLen == -1) {
+ if (length > 0)
+ break;
+ else
+ return null;
+ }
+ else
+ c = (char) ioBuffer[bufferIndex++];
+
+ if (Character.isWhitespace(c)) {
+ if (length > 0)
+ break;
+ else
+ continue;
+ }
+
+ if (length == 0) // start of token
+ start = offset-1;
+
+ buffer[length++] = c;
+ // buffer it
+ if (length == MAX_WORD_LEN) // buffer overflow!
+ break;
+ }
+
+ return new Token(new String(buffer, 0, length), start, start+length);
+ }
+}
diff --git a/src/java/org/apache/lucene/queryParser/QueryParser.jj b/src/java/org/apache/lucene/queryParser/QueryParser.jj
index 3f3a140f25a..cd2b6d1c286 100644
--- a/src/java/org/apache/lucene/queryParser/QueryParser.jj
+++ b/src/java/org/apache/lucene/queryParser/QueryParser.jj
@@ -261,11 +261,16 @@ PARSER_END(QueryParser)
<*> TOKEN : {
<#_NUM_CHAR: ["0"-"9"] >
-| <#_TERM_START_CHAR: [ "a"-"z", "A"-"Z", "_", "\u0080"-"\uFFFE" ] >
-| <#_TERM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9", "_", ".", "\u0080"-"\uFFFE" ] >
+| <#_TERM_START_CHAR: ~[ " ", "\t", "+", "-", "!", "(", ")", ":", "^",
+ "[", "]", "\"", "{", "}", "~", "*" ] >
+| <#_TERM_CHAR: <_TERM_START_CHAR> >
| <#_WHITESPACE: ( " " | "\t" ) >
}
+ SKIP : {
+ <<_WHITESPACE>>
+}
+
TOKEN : {
|
@@ -275,9 +280,8 @@ PARSER_END(QueryParser)
|
|
|
-|
+| : Boost
|
-| )+ ( "." (<_NUM_CHAR>)+ )? >
| (<_TERM_CHAR>)* >
|
| (<_TERM_CHAR>)* "*" >
@@ -287,8 +291,8 @@ PARSER_END(QueryParser)
|
}
- SKIP : {
- <<_WHITESPACE>>
+ TOKEN : {
+)+ ( "." (<_NUM_CHAR>)+ )? > : DEFAULT
}
// * Query ::= ( Clause )*
diff --git a/src/test/org/apache/lucene/analysis/TestAnalyzers.java b/src/test/org/apache/lucene/analysis/TestAnalyzers.java
index 39ae9763226..19ce96efa0b 100644
--- a/src/test/org/apache/lucene/analysis/TestAnalyzers.java
+++ b/src/test/org/apache/lucene/analysis/TestAnalyzers.java
@@ -100,6 +100,26 @@ public class TestAnalyzers extends TestCase {
new String[] { "quoted", "word" });
}
+ public void testNull() throws Exception {
+ Analyzer a = new NullAnalyzer();
+ assertAnalyzesTo(a, "foo bar FOO BAR",
+ new String[] { "foo", "bar", "FOO", "BAR" });
+ assertAnalyzesTo(a, "foo bar . FOO <> BAR",
+ new String[] { "foo", "bar", ".", "FOO", "<>", "BAR" });
+ assertAnalyzesTo(a, "foo.bar.FOO.BAR",
+ new String[] { "foo.bar.FOO.BAR" });
+ assertAnalyzesTo(a, "U.S.A.",
+ new String[] { "U.S.A." });
+ assertAnalyzesTo(a, "C++",
+ new String[] { "C++" });
+ assertAnalyzesTo(a, "B2B",
+ new String[] { "B2B" });
+ assertAnalyzesTo(a, "2B",
+ new String[] { "2B" });
+ assertAnalyzesTo(a, "\"QUOTED\" word",
+ new String[] { "\"QUOTED\"", "word" });
+ }
+
public void testStop() throws Exception {
Analyzer a = new StopAnalyzer();
assertAnalyzesTo(a, "foo bar FOO BAR",
diff --git a/src/test/org/apache/lucene/queryParser/TestQueryParser.java b/src/test/org/apache/lucene/queryParser/TestQueryParser.java
index 1513bd3c76c..19e158ff1b5 100644
--- a/src/test/org/apache/lucene/queryParser/TestQueryParser.java
+++ b/src/test/org/apache/lucene/queryParser/TestQueryParser.java
@@ -61,6 +61,7 @@ import org.apache.lucene.*;
import org.apache.lucene.queryParser.*;
import org.apache.lucene.search.*;
import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.analysis.Token;
public class TestQueryParser extends TestCase {
@@ -135,8 +136,6 @@ public class TestQueryParser extends TestCase {
assertQueryEquals("term term term", null, "term term term");
assertQueryEquals("türm term term", null, "türm term term");
assertQueryEquals("ümlaut", null, "ümlaut");
- assertQueryEquals("term term1 term2", null, "term term term");
- assertQueryEquals("term 1.0 1 2", null, "term");
assertQueryEquals("a AND b", null, "+a +b");
assertQueryEquals("(a AND b)", null, "+a +b");
@@ -145,7 +144,6 @@ public class TestQueryParser extends TestCase {
assertQueryEquals("a AND -b", null, "+a -b");
assertQueryEquals("a AND !b", null, "+a -b");
assertQueryEquals("a && b", null, "+a +b");
- assertQueryEquals("a&&b", null, "+a +b");
assertQueryEquals("a && ! b", null, "+a -b");
assertQueryEquals("a OR b", null, "a b");
@@ -179,6 +177,25 @@ public class TestQueryParser extends TestCase {
"+(title:dog title:cat) -author:\"bob dole\"");
}
+ public void testPunct() throws Exception {
+ Analyzer a = new NullAnalyzer();
+ assertQueryEquals("a&b", a, "a&b");
+ assertQueryEquals("a&&b", a, "a&&b");
+ assertQueryEquals(".NET", a, ".NET");
+ }
+
+ public void testNumber() throws Exception {
+ // The numbers go away because SimpleAnalzyer ignores them
+ assertQueryEquals("3", null, "");
+ assertQueryEquals("term 1.0 1 2", null, "term");
+ assertQueryEquals("term term1 term2", null, "term term term");
+
+ Analyzer a = new StandardAnalyzer();
+ assertQueryEquals("3", a, "3");
+ assertQueryEquals("term 1.0 1 2", a, "term 1.0 1 2");
+ assertQueryEquals("term term1 term2", a, "term term1 term2");
+ }
+
public void testWildcard() throws Exception {
assertQueryEquals("term*", null, "term*");
assertQueryEquals("term*^2", null, "term*^2.0");