LUCENE-5336: add SimpleQueryParser for human-entered queries

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1541151 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2013-11-12 17:02:51 +00:00
parent 85a8991beb
commit 3c9e753df6
4 changed files with 1104 additions and 0 deletions

View File

@ -61,6 +61,13 @@ Optimizations
on Windows if NIOFSDirectory is used, mmapped files are still locked.
(Michael Poindexter, Robert Muir, Uwe Schindler)
======================= Lucene 4.7.0 =======================
New Features
* LUCENE-5336: Add SimpleQueryParser: parser for human-entered queries.
(Jack Conradson via Robert Muir)
======================= Lucene 4.6.0 =======================
New Features

View File

@ -0,0 +1,517 @@
package org.apache.lucene.queryparser.simple;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.QueryBuilder;
import java.util.Collections;
import java.util.Map;
/**
* SimpleQueryParser is used to parse human readable query syntax.
* <p>
* The main idea behind this parser is that a person should be able to type
* whatever they want to represent a query, and this parser will do its best
* to interpret what to search for no matter how poorly composed the request
* may be. Tokens are considered to be any of a term, phrase, or subquery for the
* operations described below. Whitespace including ' ' '\n' '\r' and '\t'
* and certain operators may be used to delimit tokens ( ) + | " .
* <p>
* Any errors in query syntax will be ignored and the parser will attempt
* to decipher what it can; however, this may mean odd or unexpected results.
* <h4>Query Operators</h4>
* <ul>
* <li>'{@code +}' specifies {@code AND} operation: <tt>token1+token2</tt>
* <li>'{@code |}' specifies {@code OR} operation: <tt>token1|token2</tt>
* <li>'{@code -}' negates a single token: <tt>-token0</tt>
* <li>'{@code "}' creates phrases of terms: <tt>"term1 term2 ..."</tt>
* <li>'{@code *}' at the end of terms specifies prefix query: <tt>term*</tt>
* <li>'{@code (}' and '{@code )}' specifies precedence: <tt>token1 + (token2 | token3)</tt>
* </ul>
* <p>
* The {@link #setDefaultOperator default operator} is {@code OR} if no other operator is specified.
* For example, the following will {@code OR} {@code token1} and {@code token2} together:
* <tt>token1 token2</tt>
* <p>
* Normal operator precedence will be simple order from left to right.
* For example, the following will evaluate {@code token1 OR token2} first,
* then {@code AND} with {@code token3}:
* <blockquote>token1 | token2 + token3</blockquote>
* <h4>Escaping</h4>
* <p>
* An individual term may contain any possible character with certain characters
* requiring escaping using a '{@code \}'. The following characters will need to be escaped in
* terms and phrases:
* {@code + | " ( ) ' \}
* <p>
* The '{@code -}' operator is a special case. On individual terms (not phrases) the first
* character of a term that is {@code -} must be escaped; however, any '{@code -}' characters
* beyond the first character do not need to be escaped.
* For example:
* <ul>
* <li>{@code -term1} -- Specifies {@code NOT} operation against {@code term1}
* <li>{@code \-term1} -- Searches for the term {@code -term1}.
* <li>{@code term-1} -- Searches for the term {@code term-1}.
* <li>{@code term\-1} -- Searches for the term {@code term-1}.
* </ul>
* <p>
* The '{@code *}' operator is a special case. On individual terms (not phrases) the last
* character of a term that is '{@code *}' must be escaped; however, any '{@code *}' characters
* before the last character do not need to be escaped:
* <ul>
* <li>{@code term1*} -- Searches for the prefix {@code term1}
* <li>{@code term1\*} -- Searches for the term {@code term1*}
* <li>{@code term*1} -- Searches for the term {@code term*1}
* <li>{@code term\*1} -- Searches for the term {@code term*1}
* </ul>
* <p>
* Note that the above examples consider the terms before text processing.
*/
public class SimpleQueryParser extends QueryBuilder {
/** Fields to query against, each mapped to the boost given to that field's query. */
protected final Map<String,Float> weights;

/**
 * Bit set built from the {@code *_OPERATOR(S)} constants below; an operator's
 * syntax is only recognized while its bit is set.
 */
protected final int flags;

/** Enables {@code AND} operator (+) */
public static final int AND_OPERATOR = 1 << 0;
/** Enables {@code NOT} operator (-) */
public static final int NOT_OPERATOR = 1 << 1;
/** Enables {@code OR} operator (|) */
public static final int OR_OPERATOR = 1 << 2;
/** Enables {@code PREFIX} operator (*) */
public static final int PREFIX_OPERATOR = 1 << 3;
/** Enables {@code PHRASE} operator (") */
public static final int PHRASE_OPERATOR = 1 << 4;
/** Enables {@code PRECEDENCE} operators: {@code (} and {@code )} */
public static final int PRECEDENCE_OPERATORS = 1 << 5;
/** Enables {@code ESCAPE} operator (\) */
public static final int ESCAPE_OPERATOR = 1 << 6;
/** Enables {@code WHITESPACE} operators: ' ' '\n' '\r' '\t' */
public static final int WHITESPACE_OPERATOR = 1 << 7;

/** Operator used between tokens when none is written explicitly; starts out as {@code SHOULD}. */
private BooleanClause.Occur defaultOperator = BooleanClause.Occur.SHOULD;
/**
 * Creates a new parser searching over a single field.
 *
 * @param analyzer analyzer used to process query text
 * @param field the only field to search; it is given a weight of 1.0
 */
public SimpleQueryParser(Analyzer analyzer, String field) {
  this(analyzer, Collections.singletonMap(field, 1f));
}
/**
 * Creates a new parser searching over multiple fields with different weights.
 *
 * @param analyzer analyzer used to process query text
 * @param weights fields to search, mapped to their weights
 */
public SimpleQueryParser(Analyzer analyzer, Map<String, Float> weights) {
  // all flag bits set, so every operator is enabled
  this(analyzer, weights, ~0);
}
/**
 * Creates a new parser with custom flags used to enable/disable certain features.
 *
 * @param analyzer analyzer used to process query text
 * @param weights fields to search, mapped to their weights; the map is stored as given
 * @param flags bitwise OR of the {@code *_OPERATOR(S)} constants to enable
 */
public SimpleQueryParser(Analyzer analyzer, Map<String, Float> weights, int flags) {
  super(analyzer);
  this.flags = flags;
  this.weights = weights;
}
/**
 * Parses the query text and returns the parsed query.
 *
 * @param queryText the query string to interpret
 * @return the parsed query, or {@code null} if no clause could be built from the text
 */
public Query parse(String queryText) {
  final char[] chars = queryText.toCharArray();
  // one scratch buffer, as long as the input, is reused for every token copied out of it
  final State root = new State(chars, new char[chars.length], 0, chars.length);
  parseSubQuery(root);
  return root.top;
}
/**
 * Walks the region of the input covered by {@code state}, dispatching on the
 * current character: operators update the pending state, everything else is
 * handed to one of the {@code consume*} methods. A character only counts as an
 * operator while its flag is enabled; otherwise it is treated as part of a token.
 */
private void parseSubQuery(State state) {
  while (state.index < state.length) {
    final char c = state.data[state.index];

    if (c == '-' && (flags & NOT_OPERATOR) != 0) {
      // count the negation (two in a row cancel out) and loop again right away,
      // skipping the reset below so the count survives until the next character
      ++state.not;
      ++state.index;
      continue;
    }

    if (c == '(' && (flags & PRECEDENCE_OPERATORS) != 0) {
      // start of a subquery
      consumeSubQuery(state);
    } else if (c == ')' && (flags & PRECEDENCE_OPERATORS) != 0) {
      // a closing parenthesis with no matching opener is extraneous; skip it
      ++state.index;
    } else if (c == '"' && (flags & PHRASE_OPERATOR) != 0) {
      // start of a phrase
      consumePhrase(state);
    } else if (c == '+' && (flags & AND_OPERATOR) != 0) {
      // explicit AND: ignored when an operator is already pending, or when
      // nothing has been parsed yet and there is no left-hand side to AND with
      if (state.top != null && state.currentOperation == null) {
        state.currentOperation = BooleanClause.Occur.MUST;
      }
      ++state.index;
    } else if (c == '|' && (flags & OR_OPERATOR) != 0) {
      // explicit OR: ignored under the same conditions as AND above
      if (state.top != null && state.currentOperation == null) {
        state.currentOperation = BooleanClause.Occur.SHOULD;
      }
      ++state.index;
    } else if ((flags & WHITESPACE_OPERATOR) != 0
        && (c == ' ' || c == '\t' || c == '\n' || c == '\r')) {
      // whitespace either already served as a delimiter or is simply extraneous
      ++state.index;
    } else {
      // start of a plain token
      consumeToken(state);
    }

    // a negation only applies to what directly follows it; anything else,
    // even whitespace, discards it
    state.not = 0;
  }
}
/**
 * Handles an opening parenthesis at {@code state.index}: finds the matching
 * closing parenthesis, parses the enclosed text recursively and attaches the
 * result to the query tree. An unmatched opener is skipped; an empty pair
 * {@code ()} just clears the pending operator.
 */
private void consumeSubQuery(State state) {
  assert (flags & PRECEDENCE_OPERATORS) != 0;

  final boolean escapeEnabled = (flags & ESCAPE_OPERATOR) != 0;
  final int bodyStart = ++state.index;
  int depth = 1;
  boolean escaped = false;

  for (; state.index < state.length; ++state.index) {
    if (escaped) {
      // the character after an escape belongs to the subquery text as-is
      escaped = false;
      continue;
    }
    final char c = state.data[state.index];
    if (c == '\\') {
      // only acts as an escape when the escape operator is enabled
      escaped = escapeEnabled;
    } else if (c == '(') {
      // nested subquery
      ++depth;
    } else if (c == ')' && --depth == 0) {
      // matching close found; index is left pointing at it
      break;
    }
  }

  if (state.index == state.length) {
    // never closed: treat the opening parenthesis as extraneous and
    // resume parsing right after it
    state.index = bodyStart;
    return;
  }

  if (state.index > bodyStart) {
    // parse the enclosed region with a fresh state that shares the same arrays
    final State subState = new State(state.data, state.buffer, bodyStart, state.index);
    parseSubQuery(subState);
    buildQueryTree(state, subState.top);
  } else {
    // "()" : the pending operator would have applied to this subquery, so drop it
    state.currentOperation = null;
  }
  // step over the closing parenthesis
  ++state.index;
}
/**
 * Handles an opening double quote at {@code state.index}: copies the text up
 * to the closing quote (resolving escapes) and attaches the resulting phrase
 * query to the tree. An unmatched quote is skipped; an empty pair {@code ""}
 * just clears the pending operator.
 */
private void consumePhrase(State state) {
  assert (flags & PHRASE_OPERATOR) != 0;

  final boolean escapeEnabled = (flags & ESCAPE_OPERATOR) != 0;
  final int contentStart = ++state.index;
  int length = 0;
  boolean escaped = false;

  for (; state.index < state.length; ++state.index) {
    final char c = state.data[state.index];
    if (!escaped) {
      if (c == '\\' && escapeEnabled) {
        // drop the escape character itself; the next character is taken literally
        escaped = true;
        continue;
      }
      if (c == '"') {
        // closing quote found; index is left pointing at it
        break;
      }
    }
    escaped = false;
    state.buffer[length++] = c;
  }

  if (state.index == state.length) {
    // never closed: treat the opening quote as extraneous and
    // resume parsing right after it
    state.index = contentStart;
    return;
  }

  if (state.index == contentStart) {
    // "" : the pending operator would have applied to this phrase, so drop it
    state.currentOperation = null;
  } else {
    // hand the collected text to the phrase factory
    buildQueryTree(state, newPhraseQuery(new String(state.buffer, 0, length)));
  }
  // step over the closing quote
  ++state.index;
}
/**
 * Copies a single term starting at {@code state.index} up to the next enabled
 * delimiter (resolving escapes) and attaches a query for it to the tree. A
 * term that ends in an unescaped '*' becomes a prefix query on the text before
 * the '*'; any other non-empty term goes through {@link #newDefaultQuery}.
 */
private void consumeToken(State state) {
  final boolean escapeEnabled = (flags & ESCAPE_OPERATOR) != 0;
  int length = 0;
  boolean escaped = false;
  // true while the most recently copied character is an unescaped '*'
  // that was preceded by at least one other character
  boolean trailingStar = false;

  for (; state.index < state.length; ++state.index) {
    final char c = state.data[state.index];
    if (!escaped) {
      if (c == '\\' && escapeEnabled) {
        // drop the escape character; whatever follows is literal and
        // therefore can never be the prefix operator
        escaped = true;
        trailingStar = false;
        continue;
      }
      final boolean delimiter =
          (c == '"' && (flags & PHRASE_OPERATOR) != 0)
              || (c == '|' && (flags & OR_OPERATOR) != 0)
              || (c == '+' && (flags & AND_OPERATOR) != 0)
              || ((c == '(' || c == ')') && (flags & PRECEDENCE_OPERATORS) != 0)
              || ((c == ' ' || c == '\t' || c == '\n' || c == '\r')
                  && (flags & WHITESPACE_OPERATOR) != 0);
      if (delimiter) {
        // end of the term; the delimiter is left for the caller to handle
        break;
      }
      trailingStar = length > 0 && c == '*' && (flags & PREFIX_OPERATOR) != 0;
    }
    escaped = false;
    state.buffer[length++] = c;
  }

  if (length == 0) {
    // nothing was collected (e.g. a lone trailing escape character)
    return;
  }

  final Query branch;
  if (trailingStar) {
    // strip the closing '*' and search for the remaining text as a prefix
    branch = newPrefixQuery(new String(state.buffer, 0, length - 1));
  } else {
    // ordinary term: build the default query from the full text
    branch = newDefaultQuery(new String(state.buffer, 0, length));
  }
  buildQueryTree(state, branch);
}
// buildQueryTree is called after a term, phrase, or subquery has been
// consumed, to attach the resulting branch to the query tree in state.
// A null branch is ignored entirely: the tree and the pending operator
// are both left untouched.
private void buildQueryTree(State state, Query branch) {
  if (branch == null) {
    return;
  }

  Query clause = branch;
  if (state.not % 2 == 1) {
    // an odd number of '-' operators negates the branch: wrap it so that it
    // matches every document except those matching the branch
    BooleanQuery negated = new BooleanQuery();
    negated.add(branch, BooleanClause.Occur.MUST_NOT);
    negated.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
    clause = negated;
  }

  if (state.top == null) {
    // first term (or phrase or subquery): it becomes the tree
    state.top = clause;
  } else {
    // fall back to the default operator when none was written explicitly
    final BooleanClause.Occur operation =
        state.currentOperation != null ? state.currentOperation : defaultOperator;

    // when the operator differs from the one used for the previous clause,
    // everything built so far becomes a single clause of a new parent node,
    // so it is evaluated as a unit before the new operator applies
    if (state.previousOperation != operation) {
      BooleanQuery parent = new BooleanQuery();
      parent.add(state.top, operation);
      state.top = parent;
    }

    ((BooleanQuery) state.top).add(clause, operation);
    state.previousOperation = operation;
  }

  // the pending operator has now been consumed by this branch
  state.currentOperation = null;
}
/**
 * Factory method to generate a standard query (no phrase or prefix operators).
 *
 * @param text the term text to search for
 * @return the per-field queries combined as optional clauses, the single
 *         field query when only one was produced, or {@code null} when none was
 */
protected Query newDefaultQuery(String text) {
  final BooleanQuery perField = new BooleanQuery(true);
  for (Map.Entry<String,Float> fieldWeight : weights.entrySet()) {
    final Query fieldQuery = createBooleanQuery(fieldWeight.getKey(), text, defaultOperator);
    if (fieldQuery == null) {
      // no query was produced for this field
      continue;
    }
    fieldQuery.setBoost(fieldWeight.getValue());
    perField.add(fieldQuery, BooleanClause.Occur.SHOULD);
  }
  return simplify(perField);
}
/**
 * Factory method to generate a phrase query.
 *
 * @param text the phrase text to search for
 * @return the per-field phrase queries combined as optional clauses, the single
 *         field query when only one was produced, or {@code null} when none was
 */
protected Query newPhraseQuery(String text) {
  final BooleanQuery perField = new BooleanQuery(true);
  for (Map.Entry<String,Float> fieldWeight : weights.entrySet()) {
    final Query fieldQuery = createPhraseQuery(fieldWeight.getKey(), text);
    if (fieldQuery == null) {
      // no query was produced for this field
      continue;
    }
    fieldQuery.setBoost(fieldWeight.getValue());
    perField.add(fieldQuery, BooleanClause.Occur.SHOULD);
  }
  return simplify(perField);
}
/**
 * Factory method to generate a prefix query.
 *
 * @param text the prefix to search for; it is used as-is to build the term
 * @return one prefix query per field combined as optional clauses, the single
 *         prefix query when there is only one field, or {@code null} when
 *         there are no fields
 */
protected Query newPrefixQuery(String text) {
  final BooleanQuery perField = new BooleanQuery(true);
  for (Map.Entry<String,Float> fieldWeight : weights.entrySet()) {
    final Term prefixTerm = new Term(fieldWeight.getKey(), text);
    final PrefixQuery fieldQuery = new PrefixQuery(prefixTerm);
    fieldQuery.setBoost(fieldWeight.getValue());
    perField.add(fieldQuery, BooleanClause.Occur.SHOULD);
  }
  return simplify(perField);
}
/**
 * Helper to simplify boolean queries with 0 or 1 clause.
 *
 * @param bq the boolean query to inspect
 * @return {@code null} for no clauses, the wrapped query for exactly one
 *         clause, otherwise {@code bq} itself
 */
protected Query simplify(BooleanQuery bq) {
  switch (bq.clauses().size()) {
    case 0:
      return null;
    case 1:
      return bq.clauses().get(0).getQuery();
    default:
      return bq;
  }
}
/**
 * Returns the implicit operator setting, i.e. the operator used between
 * tokens when none is written explicitly.
 *
 * @return either {@code SHOULD} or {@code MUST}
 */
public BooleanClause.Occur getDefaultOperator() {
  return this.defaultOperator;
}
/**
 * Sets the implicit operator setting.
 *
 * @param operator the operator to use between tokens when none is written
 *        explicitly; must be either {@code SHOULD} or {@code MUST}
 * @throws IllegalArgumentException if {@code operator} is anything else
 */
public void setDefaultOperator(BooleanClause.Occur operator) {
  final boolean allowed =
      operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST;
  if (!allowed) {
    throw new IllegalArgumentException("invalid operator: only SHOULD or MUST are allowed");
  }
  defaultOperator = operator;
}
/** Mutable parse position and partial result for one (sub)query region of the input. */
static class State {
  /** the characters in the query string */
  final char[] data;
  /** a temporary buffer used to reduce necessary allocations; shared with sub-states */
  final char[] buffer;
  /** position in {@code data} of the next character to examine */
  int index;
  /** position in {@code data} at which this state's region ends (exclusive) */
  int length;
  /** operator waiting to be applied to the next clause, or {@code null} if none is pending */
  BooleanClause.Occur currentOperation;
  /** operator that was used when the most recent clause was attached */
  BooleanClause.Occur previousOperation;
  /** number of '-' operators seen directly before the current token */
  int not;
  /** root of the query tree built so far, or {@code null} if nothing has been attached */
  Query top;

  State(char[] data, char[] buffer, int index, int length) {
    this.index = index;
    this.length = length;
    this.data = data;
    this.buffer = buffer;
  }
}
}

View File

@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<body>
A simple query parser for human-entered queries.
</body>
</html>

View File

@ -0,0 +1,558 @@
package org.apache.lucene.queryparser.simple;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import static org.apache.lucene.queryparser.simple.SimpleQueryParser.AND_OPERATOR;
import static org.apache.lucene.queryparser.simple.SimpleQueryParser.ESCAPE_OPERATOR;
import static org.apache.lucene.queryparser.simple.SimpleQueryParser.NOT_OPERATOR;
import static org.apache.lucene.queryparser.simple.SimpleQueryParser.OR_OPERATOR;
import static org.apache.lucene.queryparser.simple.SimpleQueryParser.PHRASE_OPERATOR;
import static org.apache.lucene.queryparser.simple.SimpleQueryParser.PRECEDENCE_OPERATORS;
import static org.apache.lucene.queryparser.simple.SimpleQueryParser.PREFIX_OPERATOR;
import static org.apache.lucene.queryparser.simple.SimpleQueryParser.WHITESPACE_OPERATOR;
/** Tests for {@link SimpleQueryParser} */
public class TestSimpleQueryParser extends LuceneTestCase {
/**
 * Parses {@code text} against "field" using a {@link MockAnalyzer} and a
 * default operator of MUST.
 */
private Query parse(String text) {
  SimpleQueryParser parser = new SimpleQueryParser(new MockAnalyzer(random()), "field");
  parser.setDefaultOperator(Occur.MUST);
  return parser.parse(text);
}
/** a bare word parses to a single term query */
public void testTerm() throws Exception {
  Term term = new Term("field", "foobar");
  assertEquals(new TermQuery(term), parse("foobar"));
}
/** quoted words parse to a phrase query */
public void testPhrase() throws Exception {
  PhraseQuery fooBar = new PhraseQuery();
  for (String word : new String[] {"foo", "bar"}) {
    fooBar.add(new Term("field", word));
  }
  assertEquals(fooBar, parse("\"foo bar\""));
}
/** a word with a trailing '*' parses to a prefix query */
public void testPrefix() throws Exception {
  Term prefix = new Term("field", "foobar");
  assertEquals(new PrefixQuery(prefix), parse("foobar*"));
}
/** terms joined with the '+' operator are both required */
public void testAND() throws Exception {
  TermQuery foo = new TermQuery(new Term("field", "foo"));
  TermQuery bar = new TermQuery(new Term("field", "bar"));
  BooleanQuery both = new BooleanQuery();
  both.add(foo, Occur.MUST);
  both.add(bar, Occur.MUST);
  assertEquals(both, parse("foo+bar"));
}
/** phrases joined with the '+' operator are both required */
public void testANDPhrase() throws Exception {
  String[][] phrases = {{"foo", "bar"}, {"star", "wars"}};
  BooleanQuery both = new BooleanQuery();
  for (String[] words : phrases) {
    PhraseQuery phrase = new PhraseQuery();
    for (String word : words) {
      phrase.add(new Term("field", word));
    }
    both.add(phrase, Occur.MUST);
  }
  assertEquals(both, parse("\"foo bar\"+\"star wars\""));
}
/** whitespace-separated terms are AND'd when the default operator is MUST */
public void testANDImplicit() throws Exception {
  BooleanQuery both = new BooleanQuery();
  for (String word : new String[] {"foo", "bar"}) {
    both.add(new TermQuery(new Term("field", word)), Occur.MUST);
  }
  assertEquals(both, parse("foo bar"));
}
/** terms joined with one or more '|' operators are optional */
public void testOR() throws Exception {
  BooleanQuery either = new BooleanQuery();
  for (String word : new String[] {"foo", "bar"}) {
    either.add(new TermQuery(new Term("field", word)), Occur.SHOULD);
  }
  for (String input : new String[] {"foo|bar", "foo||bar"}) {
    assertEquals(either, parse(input));
  }
}
/** whitespace-separated terms are OR'd when the parser's default operator is left unchanged */
public void testORImplicit() throws Exception {
  TermQuery foo = new TermQuery(new Term("field", "foo"));
  TermQuery bar = new TermQuery(new Term("field", "bar"));
  BooleanQuery either = new BooleanQuery();
  either.add(foo, Occur.SHOULD);
  either.add(bar, Occur.SHOULD);
  Analyzer analyzer = new MockAnalyzer(random());
  SimpleQueryParser untouchedDefaults = new SimpleQueryParser(analyzer, "field");
  assertEquals(either, untouchedDefaults.parse("foo bar"));
}
/** phrases joined with the '|' operator are optional */
public void testORPhrase() throws Exception {
  String[][] phrases = {{"foo", "bar"}, {"star", "wars"}};
  BooleanQuery either = new BooleanQuery();
  for (String[] words : phrases) {
    PhraseQuery phrase = new PhraseQuery();
    for (String word : words) {
      phrase.add(new Term("field", word));
    }
    either.add(phrase, Occur.SHOULD);
  }
  assertEquals(either, parse("\"foo bar\"|\"star wars\""));
}
/** a negated term (or subquery) is excluded from a match-all query; an odd number of '-' still negates */
public void testNOT() throws Exception {
  BooleanQuery negated = new BooleanQuery();
  negated.add(new TermQuery(new Term("field", "foo")), Occur.MUST_NOT);
  negated.add(new MatchAllDocsQuery(), Occur.SHOULD);
  for (String input : new String[] {"-foo", "-(foo)", "---foo"}) {
    assertEquals(negated, parse(input));
  }
}
/** only the final asterisk is the prefix operator; earlier ones stay in the term */
public void testCrazyPrefixes1() throws Exception {
  Term prefix = new Term("field", "st*ar");
  assertEquals(new PrefixQuery(prefix), parse("st*ar*"));
}
/** an escaped backslash and an inner asterisk are kept; the final asterisk is the prefix operator */
public void testCrazyPrefixes2() throws Exception {
  Term prefix = new Term("field", "st*ar\\*");
  assertEquals(new PrefixQuery(prefix), parse("st*ar\\\\**"));
}
/** the trailing asterisk is escaped, so this is a plain term and not a prefix query */
public void testTermInDisguise() throws Exception {
  Term term = new Term("field", "st*ar\\*");
  assertEquals(new TermQuery(term), parse("sT*Ar\\\\\\*"));
}
// a number of test cases here have garbage/errors in
// the syntax passed in, to test that the parser can
// still make a reasonable guess at what the human
// input was meant to be
/** stray delimiters and operators around a single word still yield that one term */
public void testGarbageTerm() throws Exception {
  Query star = new TermQuery(new Term("field", "star"));
  String[] inputs = {
    "star",
    "star\n",
    "star\r",
    "star\t",
    "star(",
    "star)",
    "star\"",
    "\t \r\n\nstar \n \r \t ",
    "- + \"\" - star \\"
  };
  for (String input : inputs) {
    assertEquals(star, parse(input));
  }
}
/** input without any usable term parses to null */
public void testGarbageEmpty() throws Exception {
  String[] inputs = {
    "",
    " ",
    " ",
    "\\ ",
    "\\ \\ ",
    "\"\"",
    "\" \"",
    "\" \"|\" \"",
    "(\" \"|\" \")",
    "\" \" \" \"",
    "(\" \" \" \")"
  };
  for (String input : inputs) {
    assertNull(parse(input));
  }
}
/** redundant or misplaced operators around two AND'd terms are ignored */
public void testGarbageAND() throws Exception {
  BooleanQuery both = new BooleanQuery();
  for (String word : new String[] {"star", "wars"}) {
    both.add(new TermQuery(new Term("field", word)), Occur.MUST);
  }
  String[] inputs = {
    "star wars",
    "star+wars",
    " star wars ",
    " star + wars ",
    " | star + + | wars ",
    " | star + + | wars \\"
  };
  for (String input : inputs) {
    assertEquals(both, parse(input));
  }
}
/** redundant or misplaced operators around two OR'd terms are ignored */
public void testGarbageOR() throws Exception {
  BooleanQuery either = new BooleanQuery();
  for (String word : new String[] {"star", "wars"}) {
    either.add(new TermQuery(new Term("field", word)), Occur.SHOULD);
  }
  String[] inputs = {
    "star|wars",
    " star | wars ",
    " | star | + | wars ",
    " + star | + + wars \\"
  };
  for (String input : inputs) {
    assertEquals(either, parse(input));
  }
}
/** dangling or separated '-' operators do not change the negation of the term they touch */
public void testGarbageNOT() throws Exception {
  BooleanQuery negated = new BooleanQuery();
  negated.add(new TermQuery(new Term("field", "star")), Occur.MUST_NOT);
  negated.add(new MatchAllDocsQuery(), Occur.SHOULD);
  for (String input : new String[] {"-star", "---star", "- -star -"}) {
    assertEquals(negated, parse(input));
  }
}
/** empty phrases, stray operators and trailing escapes around a phrase are ignored */
public void testGarbagePhrase() throws Exception {
  PhraseQuery starWars = new PhraseQuery();
  for (String word : new String[] {"star", "wars"}) {
    starWars.add(new Term("field", word));
  }
  String[] inputs = {
    "\"star wars\"",
    "\"star wars\\ \"",
    "\"\" | \"star wars\"",
    " \"star wars\" \"\"\\"
  };
  for (String input : inputs) {
    assertEquals(starWars, parse(input));
  }
}
/** unbalanced or empty parentheses and stray operators around a one-term subquery are ignored */
public void testGarbageSubquery() throws Exception {
  Query star = new TermQuery(new Term("field", "star"));
  String[] inputs = {
    "(star)",
    "(star))",
    "((star)",
    " -()(star) \n\n\r ",
    "| + - ( + - | star \n ) \n"
  };
  for (String input : inputs) {
    assertEquals(star, parse(input));
  }
}
/** three AND'd terms end up as clauses of one flat boolean query */
public void testCompoundAnd() throws Exception {
  BooleanQuery all = new BooleanQuery();
  for (String word : new String[] {"star", "wars", "empire"}) {
    all.add(new TermQuery(new Term("field", word)), Occur.MUST);
  }
  String[] inputs = {
    "star wars empire",
    "star+wars + empire",
    " | --star wars empire \n\\"
  };
  for (String input : inputs) {
    assertEquals(all, parse(input));
  }
}
/** three OR'd terms end up as clauses of one flat boolean query */
public void testCompoundOr() throws Exception {
  BooleanQuery any = new BooleanQuery();
  for (String word : new String[] {"star", "wars", "empire"}) {
    any.add(new TermQuery(new Term("field", word)), Occur.SHOULD);
  }
  String[] inputs = {
    "star|wars|empire",
    "star|wars | empire",
    " | --star|wars|empire \n\\"
  };
  for (String input : inputs) {
    assertEquals(any, parse(input));
  }
}
/** (star OR wars) AND empire */
public void testComplex00() throws Exception {
  BooleanQuery starOrWars = new BooleanQuery();
  starOrWars.add(new TermQuery(new Term("field", "star")), Occur.SHOULD);
  starOrWars.add(new TermQuery(new Term("field", "wars")), Occur.SHOULD);

  BooleanQuery expected = new BooleanQuery();
  expected.add(starOrWars, Occur.MUST);
  expected.add(new TermQuery(new Term("field", "empire")), Occur.MUST);

  String[] inputs = {
    "star|wars empire",
    "star|wars + empire",
    "star| + wars + ----empire |"
  };
  for (String input : inputs) {
    assertEquals(expected, parse(input));
  }
}
/** (star AND wars) OR empire */
public void testComplex01() throws Exception {
  BooleanQuery starAndWars = new BooleanQuery();
  starAndWars.add(new TermQuery(new Term("field", "star")), Occur.MUST);
  starAndWars.add(new TermQuery(new Term("field", "wars")), Occur.MUST);

  BooleanQuery expected = new BooleanQuery();
  expected.add(starAndWars, Occur.SHOULD);
  expected.add(new TermQuery(new Term("field", "empire")), Occur.SHOULD);

  String[] inputs = {
    "star wars | empire",
    "star + wars|empire",
    "star + | wars | ----empire +"
  };
  for (String input : inputs) {
    assertEquals(expected, parse(input));
  }
}
/** (star AND wars) OR empire OR strikes */
public void testComplex02() throws Exception {
  BooleanQuery starAndWars = new BooleanQuery();
  starAndWars.add(new TermQuery(new Term("field", "star")), Occur.MUST);
  starAndWars.add(new TermQuery(new Term("field", "wars")), Occur.MUST);

  BooleanQuery expected = new BooleanQuery();
  expected.add(starAndWars, Occur.SHOULD);
  for (String word : new String[] {"empire", "strikes"}) {
    expected.add(new TermQuery(new Term("field", word)), Occur.SHOULD);
  }

  String[] inputs = {
    "star wars | empire | strikes",
    "star + wars|empire | strikes",
    "star + | wars | ----empire | + --strikes \\"
  };
  for (String input : inputs) {
    assertEquals(expected, parse(input));
  }
}
/**
 * Three levels of nesting without parentheses: the expected query is
 * ((star AND wars) OR empire OR strikes) AND back. Each input variant
 * must parse to that same query.
 */
public void testComplex03() throws Exception {
  // innermost: star AND wars
  BooleanQuery starAndWars = new BooleanQuery();
  starAndWars.add(new TermQuery(new Term("field", "star")), Occur.MUST);
  starAndWars.add(new TermQuery(new Term("field", "wars")), Occur.MUST);

  // middle: (star AND wars) OR empire OR strikes
  BooleanQuery disjunction = new BooleanQuery();
  disjunction.add(starAndWars, Occur.SHOULD);
  disjunction.add(new TermQuery(new Term("field", "empire")), Occur.SHOULD);
  disjunction.add(new TermQuery(new Term("field", "strikes")), Occur.SHOULD);

  // outermost: disjunction AND back
  BooleanQuery expected = new BooleanQuery();
  expected.add(disjunction, Occur.MUST);
  expected.add(new TermQuery(new Term("field", "back")), Occur.MUST);

  String[] variants = {
    "star wars | empire | strikes back",
    "star + wars|empire | strikes + back",
    "star + | wars | ----empire | + --strikes + | --back \\"
  };
  for (String variant : variants) {
    assertEquals(expected, parse(variant));
  }
}
/**
 * Parenthesised AND groups inside an OR: the expected query is
 * (star AND wars) OR empire OR (strikes AND back). Each input variant
 * must parse to that same query.
 */
public void testComplex04() throws Exception {
  BooleanQuery starAndWars = new BooleanQuery();
  starAndWars.add(new TermQuery(new Term("field", "star")), Occur.MUST);
  starAndWars.add(new TermQuery(new Term("field", "wars")), Occur.MUST);

  BooleanQuery strikesAndBack = new BooleanQuery();
  strikesAndBack.add(new TermQuery(new Term("field", "strikes")), Occur.MUST);
  strikesAndBack.add(new TermQuery(new Term("field", "back")), Occur.MUST);

  BooleanQuery expected = new BooleanQuery();
  expected.add(starAndWars, Occur.SHOULD);
  expected.add(new TermQuery(new Term("field", "empire")), Occur.SHOULD);
  expected.add(strikesAndBack, Occur.SHOULD);

  String[] variants = {
    "(star wars) | empire | (strikes back)",
    "(star + wars) |empire | (strikes + back)",
    "(star + | wars |) | ----empire | + --(strikes + | --back) \\"
  };
  for (String variant : variants) {
    assertEquals(expected, parse(variant));
  }
}
/**
 * Nested parentheses with a negated term. The expected query is
 * (star AND wars) OR (empire OR (strikes AND back AND NOT-jarjar)), where
 * the negated term is represented as a boolean query holding a MUST_NOT
 * clause plus a MatchAllDocsQuery. The later variants add an empty "()"
 * group, an empty phrase and an unbalanced quote, and must still parse to
 * the same query.
 */
public void testComplex05() throws Exception {
  // built bottom-up; clause order inside each BooleanQuery is significant
  BooleanQuery notJarjar = new BooleanQuery();
  notJarjar.add(new TermQuery(new Term("field", "jarjar")), Occur.MUST_NOT);
  notJarjar.add(new MatchAllDocsQuery(), Occur.SHOULD);

  BooleanQuery strikesBack = new BooleanQuery();
  strikesBack.add(new TermQuery(new Term("field", "strikes")), Occur.MUST);
  strikesBack.add(new TermQuery(new Term("field", "back")), Occur.MUST);
  strikesBack.add(notJarjar, Occur.MUST);

  BooleanQuery empireBranch = new BooleanQuery();
  empireBranch.add(new TermQuery(new Term("field", "empire")), Occur.SHOULD);
  empireBranch.add(strikesBack, Occur.SHOULD);

  BooleanQuery starAndWars = new BooleanQuery();
  starAndWars.add(new TermQuery(new Term("field", "star")), Occur.MUST);
  starAndWars.add(new TermQuery(new Term("field", "wars")), Occur.MUST);

  BooleanQuery expected = new BooleanQuery();
  expected.add(starAndWars, Occur.SHOULD);
  expected.add(empireBranch, Occur.SHOULD);

  String[] variants = {
    "(star wars) | (empire | (strikes back -jarjar))",
    "(star + wars) |(empire | (strikes + back -jarjar) () )",
    "(star + | wars |) | --(--empire | + --(strikes + | --back + -jarjar) \"\" ) \""
  };
  for (String variant : variants) {
    assertEquals(expected, parse(variant));
  }
}
/**
 * Nested groups plus escaped operators. The expected query is
 * star AND (wars OR ((empire OR strikes) AND back AND "jar+|jar")), where
 * the escaped '+' and '|' end up as literal characters of the last term.
 * The later variants add an empty "()" group, an empty phrase and an
 * unbalanced quote, and must still parse to the same query.
 */
public void testComplex06() throws Exception {
  // built bottom-up; clause order inside each BooleanQuery is significant
  BooleanQuery empireOrStrikes = new BooleanQuery();
  empireOrStrikes.add(new TermQuery(new Term("field", "empire")), Occur.SHOULD);
  empireOrStrikes.add(new TermQuery(new Term("field", "strikes")), Occur.SHOULD);

  BooleanQuery conjunction = new BooleanQuery();
  conjunction.add(empireOrStrikes, Occur.MUST);
  conjunction.add(new TermQuery(new Term("field", "back")), Occur.MUST);
  conjunction.add(new TermQuery(new Term("field", "jar+|jar")), Occur.MUST);

  BooleanQuery warsBranch = new BooleanQuery();
  warsBranch.add(new TermQuery(new Term("field", "wars")), Occur.SHOULD);
  warsBranch.add(conjunction, Occur.SHOULD);

  BooleanQuery expected = new BooleanQuery();
  expected.add(new TermQuery(new Term("field", "star")), Occur.MUST);
  expected.add(warsBranch, Occur.MUST);

  String[] variants = {
    "star (wars | (empire | strikes back jar\\+\\|jar))",
    "star + (wars |(empire | strikes + back jar\\+\\|jar) () )",
    "star + (| wars | | --(--empire | + --strikes + | --back + jar\\+\\|jar) \"\" ) \""
  };
  for (String variant : variants) {
    assertEquals(expected, parse(variant));
  }
}
/**
 * Parses a single term against two weighted fields and expects one boosted
 * TermQuery per field (in map insertion order), combined as SHOULD clauses
 * of a BooleanQuery constructed with {@code BooleanQuery(true)}.
 */
public void testWeightedTerm() throws Exception {
  Map<String,Float> fieldBoosts = new LinkedHashMap<>();
  fieldBoosts.put("field0", 5f);
  fieldBoosts.put("field1", 10f);

  TermQuery onField0 = new TermQuery(new Term("field0", "foo"));
  onField0.setBoost(5f);
  TermQuery onField1 = new TermQuery(new Term("field1", "foo"));
  onField1.setBoost(10f);

  BooleanQuery expected = new BooleanQuery(true);
  expected.add(onField0, Occur.SHOULD);
  expected.add(onField1, Occur.SHOULD);

  SimpleQueryParser parser = new SimpleQueryParser(new MockAnalyzer(random()), fieldBoosts);
  assertEquals(expected, parser.parse("foo"));
}
/**
 * Parses "foo|bar" against two weighted fields. Each term is expected to
 * expand into its own {@code BooleanQuery(true)} holding one boosted
 * TermQuery per field, and the two expansions are OR'ed together in the
 * outer query.
 */
public void testWeightedOR() throws Exception {
  Map<String,Float> fieldBoosts = new LinkedHashMap<>();
  fieldBoosts.put("field0", 5f);
  fieldBoosts.put("field1", 10f);

  BooleanQuery expected = new BooleanQuery();
  for (String text : new String[] { "foo", "bar" }) {
    // per-term expansion across both fields, field0 first
    BooleanQuery perTerm = new BooleanQuery(true);

    TermQuery onField0 = new TermQuery(new Term("field0", text));
    onField0.setBoost(5f);
    perTerm.add(onField0, Occur.SHOULD);

    TermQuery onField1 = new TermQuery(new Term("field1", text));
    onField1.setBoost(10f);
    perTerm.add(onField1, Occur.SHOULD);

    expected.add(perTerm, Occur.SHOULD);
  }

  SimpleQueryParser parser = new SimpleQueryParser(new MockAnalyzer(random()), fieldBoosts);
  assertEquals(expected, parser.parse("foo|bar"));
}
/**
 * Helper: parses {@code text} over the single field "field" (weight 1)
 * using a keyword-tokenizing MockAnalyzer and the given operator flags.
 *
 * @param text  raw query string
 * @param flags operator flags passed straight to the SimpleQueryParser constructor
 * @return the query produced by the parser
 */
private Query parseKeyword(String text, int flags) {
  Analyzer keywordAnalyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
  return new SimpleQueryParser(keywordAnalyzer, Collections.singletonMap("field", 1f), flags)
      .parse(text);
}
/**
 * With PHRASE_OPERATOR masked out of the flags, the double quotes are
 * expected to stay in the term text.
 */
public void testDisablePhrase() {
  Query actual = parseKeyword("\"test\"", ~PHRASE_OPERATOR);
  assertEquals(new TermQuery(new Term("field", "\"test\"")), actual);
}
/**
 * With PREFIX_OPERATOR masked out of the flags, a trailing '*' is expected
 * to stay in the term text and yield a plain TermQuery.
 */
public void testDisablePrefix() {
  Query actual = parseKeyword("test*", ~PREFIX_OPERATOR);
  assertEquals(new TermQuery(new Term("field", "test*")), actual);
}
/**
 * With AND_OPERATOR masked out of the flags, '+' is expected to stay in
 * the term text, whether it is leading or embedded.
 */
public void testDisableAND() {
  for (String text : new String[] { "foo+bar", "+foo+bar" }) {
    // the whole input should come back as one term, unchanged
    Query expected = new TermQuery(new Term("field", text));
    assertEquals(expected, parseKeyword(text, ~AND_OPERATOR));
  }
}
/**
 * With OR_OPERATOR masked out of the flags, '|' is expected to stay in
 * the term text, whether it is leading or embedded.
 */
public void testDisableOR() {
  for (String text : new String[] { "foo|bar", "|foo|bar" }) {
    // the whole input should come back as one term, unchanged
    Query expected = new TermQuery(new Term("field", text));
    assertEquals(expected, parseKeyword(text, ~OR_OPERATOR));
  }
}
/**
 * With NOT_OPERATOR masked out of the flags, a leading '-' is expected to
 * stay in the term text.
 */
public void testDisableNOT() {
  Query actual = parseKeyword("-foo", ~NOT_OPERATOR);
  assertEquals(new TermQuery(new Term("field", "-foo")), actual);
}
/**
 * With PRECEDENCE_OPERATORS masked out of the flags, parentheses are
 * expected to stay in the term text, balanced or not.
 */
public void testDisablePrecedence() {
  for (String text : new String[] { "(foo)", ")foo(" }) {
    // the whole input should come back as one term, unchanged
    Query expected = new TermQuery(new Term("field", text));
    assertEquals(expected, parseKeyword(text, ~PRECEDENCE_OPERATORS));
  }
}
/**
 * With ESCAPE_OPERATOR masked out of the flags, the backslash is expected
 * to stay in the term text. Parentheses and quotes around the input still
 * act as operators, so all three inputs yield the same term.
 */
public void testDisableEscape() {
  Query expected = new TermQuery(new Term("field", "foo\\bar"));
  String[] inputs = { "foo\\bar", "(foo\\bar)", "\"foo\\bar\"" };
  for (String input : inputs) {
    assertEquals(expected, parseKeyword(input, ~ESCAPE_OPERATOR));
  }
}
/**
 * With WHITESPACE_OPERATOR masked out of the flags, spaces, tabs and
 * newlines are expected to stay in the term text instead of splitting it.
 */
public void testDisableWhitespace() {
  String[] inputs = { "foo foo", " foo foo\n ", "\t\tfoo foo foo" };
  for (String text : inputs) {
    // the whole input, whitespace included, should come back as one term
    Query expected = new TermQuery(new Term("field", text));
    assertEquals(expected, parseKeyword(text, ~WHITESPACE_OPERATOR));
  }
}
/**
 * Robustness check: runs 1000 random unicode strings through both
 * {@code parse} and {@code parseKeyword} (the latter with a flag value
 * drawn from {@code _TestUtil.nextInt(random(), 0, 256)}). The test passes
 * as long as nothing throws.
 */
public void testRandomQueries() throws Exception {
  int remaining = 1000;
  while (remaining-- > 0) {
    String query = _TestUtil.randomUnicodeString(random());
    parse(query); // must not throw
    int randomFlags = _TestUtil.nextInt(random(), 0, 256);
    parseKeyword(query, randomFlags); // must not throw
  }
}
/**
 * Robustness check on operator-heavy input: builds 1000 short strings
 * (0 to 19 characters) from a small alphabet made mostly of operator
 * characters and runs each through {@code parse} and {@code parseKeyword}
 * (the latter with a flag value drawn from
 * {@code _TestUtil.nextInt(random(), 0, 256)}). The test passes as long as
 * nothing throws.
 */
public void testRandomQueries2() throws Exception {
  // '+', '*' and '\\' are the AND, prefix and escape operator characters; without them
  // those code paths are never reached by this fuzz test. '&' is kept as an ordinary symbol.
  char chars[] = new char[] { 'a', '1', '|', '&', '+', '*', '\\', ' ', '(', ')', '"', '-' };
  StringBuilder sb = new StringBuilder();
  for (int i = 0; i < 1000; i++) {
    sb.setLength(0); // reuse the builder between iterations
    int queryLength = random().nextInt(20);
    for (int j = 0; j < queryLength; j++) {
      sb.append(chars[random().nextInt(chars.length)]);
    }
    String query = sb.toString();
    parse(query); // no exception
    parseKeyword(query, _TestUtil.nextInt(random(), 0, 256)); // no exception
  }
}
}