LUCENE-5410: add fuzzy and near to SimpleQueryParser

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1563558 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2014-02-02 02:57:02 +00:00
parent c70deaf3aa
commit 584fda3bdf
9 changed files with 281 additions and 50 deletions

View File

@ -125,6 +125,9 @@ New Features
* LUCENE-5320: Add SearcherTaxonomyManager over search and taxonomy index
directories (i.e. not only NRT). (Shai Erera)
* LUCENE-5410: Add fuzzy and near support via '~' operator to SimpleQueryParser.
(Lee Hinman via Robert Muir)
Build
* LUCENE-5217,LUCENE-5420: Maven config: get dependencies from Ant+Ivy config;

View File

@ -58,7 +58,12 @@ public class MultiPhraseQuery extends Query {
/** Sets the phrase slop for this query.
* @see PhraseQuery#setSlop(int)
*/
public void setSlop(int s) { slop = s; }
public void setSlop(int s) {
if (s < 0) {
throw new IllegalArgumentException("slop value cannot be negative");
}
slop = s;
}
/** Sets the phrase slop for this query.
* @see PhraseQuery#getSlop()

View File

@ -68,7 +68,12 @@ public class PhraseQuery extends Query {
results are sorted by exactness.
<p>The slop is zero by default, requiring exact matches.*/
public void setSlop(int s) { slop = s; }
public void setSlop(int s) {
if (s < 0) {
throw new IllegalArgumentException("slop value cannot be negative");
}
slop = s;
}
/** Returns the slop. See setSlop(). */
public int getSlop() { return slop; }

View File

@ -568,4 +568,16 @@ public class TestMultiPhraseQuery extends LuceneTestCase {
return terms;
}
public void testNegativeSlop() throws Exception {
MultiPhraseQuery query = new MultiPhraseQuery();
query.add(new Term("field", "two"));
query.add(new Term("field", "one"));
try {
query.setSlop(-2);
fail("didn't get expected exception");
} catch (IllegalArgumentException expected) {
// expected exception
}
}
}

View File

@ -678,4 +678,16 @@ public class TestPhraseQuery extends LuceneTestCase {
reader.close();
dir.close();
}
public void testNegativeSlop() throws Exception {
PhraseQuery query = new PhraseQuery();
query.add(new Term("field", "two"));
query.add(new Term("field", "one"));
try {
query.setSlop(-2);
fail("didn't get expected exception");
} catch (IllegalArgumentException expected) {
// expected exception
}
}
}

View File

@ -21,10 +21,12 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.QueryBuilder;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import java.util.Collections;
import java.util.Map;
@ -48,6 +50,8 @@ import java.util.Map;
* <li>'{@code -}' negates a single token: <tt>-token0</tt>
* <li>'{@code "}' creates phrases of terms: <tt>"term1 term2 ..."</tt>
* <li>'{@code *}' at the end of terms specifies prefix query: <tt>term*</tt>
* <li>'{@code ~}N' at the end of terms specifies fuzzy query: <tt>term~1</tt>
* <li>'{@code ~}N' at the end of phrases specifies near query: <tt>"term1 term2"~5</tt>
* <li>'{@code (}' and '{@code )}' specifies precedence: <tt>token1 + (token2 | token3)</tt>
* </ul>
* <p>
@ -111,6 +115,11 @@ public class SimpleQueryParser extends QueryBuilder {
public static final int ESCAPE_OPERATOR = 1<<6;
/** Enables {@code WHITESPACE} operators: ' ' '\n' '\r' '\t' */
public static final int WHITESPACE_OPERATOR = 1<<7;
/** Enables {@code FUZZY} operators: (~) on single terms */
public static final int FUZZY_OPERATOR = 1<<8;
/** Enables {@code NEAR} operators: (~) on phrases */
public static final int NEAR_OPERATOR = 1<<9;
private BooleanClause.Occur defaultOperator = BooleanClause.Occur.SHOULD;
@ -266,6 +275,7 @@ public class SimpleQueryParser extends QueryBuilder {
int start = ++state.index;
int copied = 0;
boolean escaped = false;
boolean hasSlop = false;
while (state.index < state.length) {
if (!escaped) {
@ -279,10 +289,23 @@ public class SimpleQueryParser extends QueryBuilder {
continue;
} else if (state.data[state.index] == '"') {
// this should be the end of the phrase
// all characters found will used for
// creating the phrase query
break;
// if there are still characters after the closing ", check for a
// tilde
if (state.length > (state.index + 1) &&
state.data[state.index+1] == '~' &&
(flags & NEAR_OPERATOR) != 0) {
state.index++;
// check for characters after the tilde
if (state.length > (state.index + 1)) {
hasSlop = true;
}
break;
} else {
// this should be the end of the phrase
// all characters found will used for
// creating the phrase query
break;
}
}
}
@ -305,7 +328,12 @@ public class SimpleQueryParser extends QueryBuilder {
// a complete phrase has been found and is parsed through
// through the analyzer from the given field
String phrase = new String(state.buffer, 0, copied);
Query branch = newPhraseQuery(phrase);
Query branch;
if (hasSlop) {
branch = newPhraseQuery(phrase, parseFuzziness(state));
} else {
branch = newPhraseQuery(phrase, 0);
}
buildQueryTree(state, branch);
++state.index;
@ -316,6 +344,7 @@ public class SimpleQueryParser extends QueryBuilder {
int copied = 0;
boolean escaped = false;
boolean prefix = false;
boolean fuzzy = false;
while (state.index < state.length) {
if (!escaped) {
@ -329,19 +358,14 @@ public class SimpleQueryParser extends QueryBuilder {
++state.index;
continue;
} else if ((state.data[state.index] == '"' && (flags & PHRASE_OPERATOR) != 0)
|| (state.data[state.index] == '|' && (flags & OR_OPERATOR) != 0)
|| (state.data[state.index] == '+' && (flags & AND_OPERATOR) != 0)
|| (state.data[state.index] == '(' && (flags & PRECEDENCE_OPERATORS) != 0)
|| (state.data[state.index] == ')' && (flags & PRECEDENCE_OPERATORS) != 0)
|| ((state.data[state.index] == ' '
|| state.data[state.index] == '\t'
|| state.data[state.index] == '\n'
|| state.data[state.index] == '\r') && (flags & WHITESPACE_OPERATOR) != 0)) {
} else if (tokenFinished(state)) {
// this should be the end of the term
// all characters found will used for
// creating the term query
break;
} else if (copied > 0 && state.data[state.index] == '~' && (flags & FUZZY_OPERATOR) != 0) {
fuzzy = true;
break;
}
// wildcard tracks whether or not the last character
@ -358,7 +382,17 @@ public class SimpleQueryParser extends QueryBuilder {
if (copied > 0) {
final Query branch;
if (prefix) {
if (fuzzy && (flags & FUZZY_OPERATOR) != 0) {
String token = new String(state.buffer, 0, copied);
int fuzziness = parseFuzziness(state);
// edit distance has a maximum, limit to the maximum supported
fuzziness = Math.min(fuzziness, LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
if (fuzziness == 0) {
branch = newDefaultQuery(token);
} else {
branch = newFuzzyQuery(token, fuzziness);
}
} else if (prefix) {
// if a term is found with a closing '*' it is considered to be a prefix query
// and will have prefix added as an option
String token = new String(state.buffer, 0, copied - 1);
@ -420,6 +454,60 @@ public class SimpleQueryParser extends QueryBuilder {
}
}
/**
* Helper parsing fuzziness from parsing state
* @return slop/edit distance, 0 in the case of non-parsing slop/edit string
*/
private int parseFuzziness(State state) {
char slopText[] = new char[state.length];
int slopLength = 0;
if (state.data[state.index] == '~') {
while (state.index < state.length) {
state.index++;
// it's possible that the ~ was at the end, so check after incrementing
// to make sure we don't go out of bounds
if (state.index < state.length) {
if (tokenFinished(state)) {
break;
}
slopText[slopLength] = state.data[state.index];
slopLength++;
}
}
int fuzziness = 0;
try {
fuzziness = Integer.parseInt(new String(slopText, 0, slopLength));
} catch (NumberFormatException e) {
// swallow number format exceptions parsing fuzziness
}
// negative -> 0
if (fuzziness < 0) {
fuzziness = 0;
}
return fuzziness;
}
return 0;
}
/**
* Helper returning true if the state has reached the end of token.
*/
private boolean tokenFinished(State state) {
if ((state.data[state.index] == '"' && (flags & PHRASE_OPERATOR) != 0)
|| (state.data[state.index] == '|' && (flags & OR_OPERATOR) != 0)
|| (state.data[state.index] == '+' && (flags & AND_OPERATOR) != 0)
|| (state.data[state.index] == '(' && (flags & PRECEDENCE_OPERATORS) != 0)
|| (state.data[state.index] == ')' && (flags & PRECEDENCE_OPERATORS) != 0)
|| ((state.data[state.index] == ' '
|| state.data[state.index] == '\t'
|| state.data[state.index] == '\n'
|| state.data[state.index] == '\r') && (flags & WHITESPACE_OPERATOR) != 0)) {
return true;
}
return false;
}
/**
* Factory method to generate a standard query (no phrase or prefix operators).
*/
@ -436,12 +524,27 @@ public class SimpleQueryParser extends QueryBuilder {
}
/**
* Factory method to generate a phrase query.
* Factory method to generate a fuzzy query.
*/
protected Query newPhraseQuery(String text) {
protected Query newFuzzyQuery(String text, int fuzziness) {
BooleanQuery bq = new BooleanQuery(true);
for (Map.Entry<String,Float> entry : weights.entrySet()) {
Query q = createPhraseQuery(entry.getKey(), text);
Query q = new FuzzyQuery(new Term(entry.getKey(), text), fuzziness);
if (q != null) {
q.setBoost(entry.getValue());
bq.add(q, BooleanClause.Occur.SHOULD);
}
}
return simplify(bq);
}
/**
* Factory method to generate a phrase query with slop.
*/
protected Query newPhraseQuery(String text, int slop) {
BooleanQuery bq = new BooleanQuery(true);
for (Map.Entry<String,Float> entry : weights.entrySet()) {
Query q = createPhraseQuery(entry.getKey(), text, slop);
if (q != null) {
q.setBoost(entry.getValue());
bq.add(q, BooleanClause.Occur.SHOULD);

View File

@ -27,6 +27,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
@ -34,14 +35,17 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import static org.apache.lucene.queryparser.simple.SimpleQueryParser.AND_OPERATOR;
import static org.apache.lucene.queryparser.simple.SimpleQueryParser.ESCAPE_OPERATOR;
import static org.apache.lucene.queryparser.simple.SimpleQueryParser.FUZZY_OPERATOR;
import static org.apache.lucene.queryparser.simple.SimpleQueryParser.NOT_OPERATOR;
import static org.apache.lucene.queryparser.simple.SimpleQueryParser.OR_OPERATOR;
import static org.apache.lucene.queryparser.simple.SimpleQueryParser.PHRASE_OPERATOR;
import static org.apache.lucene.queryparser.simple.SimpleQueryParser.PRECEDENCE_OPERATORS;
import static org.apache.lucene.queryparser.simple.SimpleQueryParser.PREFIX_OPERATOR;
import static org.apache.lucene.queryparser.simple.SimpleQueryParser.NEAR_OPERATOR;
import static org.apache.lucene.queryparser.simple.SimpleQueryParser.WHITESPACE_OPERATOR;
/** Tests for {@link SimpleQueryParser} */
@ -58,6 +62,18 @@ public class TestSimpleQueryParser extends LuceneTestCase {
return parser.parse(text);
}
/**
* helper to parse a query with whitespace+lowercase analyzer across "field",
* with default operator of MUST
*/
private Query parse(String text, int flags) {
Analyzer analyzer = new MockAnalyzer(random());
SimpleQueryParser parser = new SimpleQueryParser(analyzer,
Collections.singletonMap("field", 1f), flags);
parser.setDefaultOperator(Occur.MUST);
return parser.parse(text);
}
/** test a simple term */
public void testTerm() throws Exception {
Query expected = new TermQuery(new Term("field", "foobar"));
@ -65,6 +81,24 @@ public class TestSimpleQueryParser extends LuceneTestCase {
assertEquals(expected, parse("foobar"));
}
/** test a fuzzy query */
public void testFuzzy() throws Exception {
Query regular = new TermQuery(new Term("field", "foobar"));
Query expected = new FuzzyQuery(new Term("field", "foobar"), 2);
assertEquals(expected, parse("foobar~2"));
assertEquals(regular, parse("foobar~"));
assertEquals(regular, parse("foobar~a"));
assertEquals(regular, parse("foobar~1a"));
BooleanQuery bool = new BooleanQuery();
FuzzyQuery fuzzy = new FuzzyQuery(new Term("field", "foo"), LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
bool.add(fuzzy, Occur.MUST);
bool.add(new TermQuery(new Term("field", "bar")), Occur.MUST);
assertEquals(bool, parse("foo~" + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + 1 + " bar"));
}
/** test a simple phrase */
public void testPhrase() throws Exception {
PhraseQuery expected = new PhraseQuery();
@ -74,6 +108,43 @@ public class TestSimpleQueryParser extends LuceneTestCase {
assertEquals(expected, parse("\"foo bar\""));
}
/** test a simple phrase with various slop settings */
public void testPhraseWithSlop() throws Exception {
PhraseQuery expectedWithSlop = new PhraseQuery();
expectedWithSlop.add(new Term("field", "foo"));
expectedWithSlop.add(new Term("field", "bar"));
expectedWithSlop.setSlop(2);
assertEquals(expectedWithSlop, parse("\"foo bar\"~2"));
PhraseQuery expectedWithMultiDigitSlop = new PhraseQuery();
expectedWithMultiDigitSlop.add(new Term("field", "foo"));
expectedWithMultiDigitSlop.add(new Term("field", "bar"));
expectedWithMultiDigitSlop.setSlop(10);
assertEquals(expectedWithMultiDigitSlop, parse("\"foo bar\"~10"));
PhraseQuery expectedNoSlop = new PhraseQuery();
expectedNoSlop.add(new Term("field", "foo"));
expectedNoSlop.add(new Term("field", "bar"));
assertEquals("Ignore trailing tilde with no slop", expectedNoSlop, parse("\"foo bar\"~"));
assertEquals("Ignore non-numeric trailing slop", expectedNoSlop, parse("\"foo bar\"~a"));
assertEquals("Ignore non-numeric trailing slop", expectedNoSlop, parse("\"foo bar\"~1a"));
assertEquals("Ignore negative trailing slop", expectedNoSlop, parse("\"foo bar\"~-1"));
PhraseQuery pq = new PhraseQuery();
pq.add(new Term("field", "foo"));
pq.add(new Term("field", "bar"));
pq.setSlop(12);
BooleanQuery expectedBoolean = new BooleanQuery();
expectedBoolean.add(pq, Occur.MUST);
expectedBoolean.add(new TermQuery(new Term("field", "baz")), Occur.MUST);
assertEquals(expectedBoolean, parse("\"foo bar\"~12 baz"));
}
/** test a simple prefix */
public void testPrefix() throws Exception {
PrefixQuery expected = new PrefixQuery(new Term("field", "foobar"));
@ -533,17 +604,33 @@ public class TestSimpleQueryParser extends LuceneTestCase {
assertEquals(expected, parseKeyword("\t\tfoo foo foo", ~WHITESPACE_OPERATOR));
}
public void testDisableFuzziness() {
Query expected = new TermQuery(new Term("field", "foo~1"));
assertEquals(expected, parseKeyword("foo~1", ~FUZZY_OPERATOR));
}
public void testDisableSlop() {
PhraseQuery expectedPhrase = new PhraseQuery();
expectedPhrase.add(new Term("field", "foo"));
expectedPhrase.add(new Term("field", "bar"));
BooleanQuery expected = new BooleanQuery();
expected.add(expectedPhrase, Occur.MUST);
expected.add(new TermQuery(new Term("field", "~2")), Occur.MUST);
assertEquals(expected, parse("\"foo bar\"~2", ~NEAR_OPERATOR));
}
// we aren't supposed to barf on any input...
public void testRandomQueries() throws Exception {
for (int i = 0; i < 1000; i++) {
String query = _TestUtil.randomUnicodeString(random());
parse(query); // no exception
parseKeyword(query, _TestUtil.nextInt(random(), 0, 256)); // no exception
parseKeyword(query, _TestUtil.nextInt(random(), 0, 1024)); // no exception
}
}
public void testRandomQueries2() throws Exception {
char chars[] = new char[] { 'a', '1', '|', '&', ' ', '(', ')', '"', '-' };
char chars[] = new char[] { 'a', '1', '|', '&', ' ', '(', ')', '"', '-', '~'};
StringBuilder sb = new StringBuilder();
for (int i = 0; i < 1000; i++) {
sb.setLength(0);
@ -552,7 +639,7 @@ public class TestSimpleQueryParser extends LuceneTestCase {
sb.append(chars[random().nextInt(chars.length)]);
}
parse(sb.toString()); // no exception
parseKeyword(sb.toString(), _TestUtil.nextInt(random(), 0, 256)); // no exception
parseKeyword(sb.toString(), _TestUtil.nextInt(random(), 0, 1024)); // no exception
}
}
}

View File

@ -18,11 +18,9 @@ package org.apache.solr.search;
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.simple.SimpleQueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.SimpleParams;
@ -73,36 +71,21 @@ public class SimpleQParserPlugin extends QParserPlugin {
/** The name that can be used to specify this plugin should be used to parse the query. */
public static String NAME = "simple";
/** Enables {@code AND} operator (+) */
private static final String AND_OPERATOR = "AND";
/** Enables {@code NOT} operator (-) */
private static final String NOT_OPERATOR = "NOT";
/** Enables {@code OR} operator (|) */
private static final String OR_OPERATOR = "OR";
/** Enables {@code PREFIX} operator (*) */
private static final String PREFIX_OPERATOR = "PREFIX";
/** Enables {@code PHRASE} operator (") */
private static final String PHRASE_OPERATOR = "PHRASE";
/** Enables {@code PRECEDENCE} operators: {@code (} and {@code )} */
private static final String PRECEDENCE_OPERATORS = "PRECEDENCE";
/** Enables {@code ESCAPE} operator (\) */
private static final String ESCAPE_OPERATOR = "ESCAPE";
/** Enables {@code WHITESPACE} operators: ' ' '\n' '\r' '\t' */
private static final String WHITESPACE_OPERATOR = "WHITESPACE";
/** Map of string operators to their int counterparts in SimpleQueryParser. */
private static final Map<String, Integer> OPERATORS = new HashMap<String, Integer>();
/* Setup the map of possible operators. */
static {
OPERATORS.put(AND_OPERATOR, SimpleQueryParser.AND_OPERATOR);
OPERATORS.put(NOT_OPERATOR, SimpleQueryParser.NOT_OPERATOR);
OPERATORS.put(OR_OPERATOR, SimpleQueryParser.OR_OPERATOR);
OPERATORS.put(PREFIX_OPERATOR, SimpleQueryParser.PREFIX_OPERATOR);
OPERATORS.put(PHRASE_OPERATOR, SimpleQueryParser.PHRASE_OPERATOR);
OPERATORS.put(PRECEDENCE_OPERATORS, SimpleQueryParser.PRECEDENCE_OPERATORS);
OPERATORS.put(ESCAPE_OPERATOR, SimpleQueryParser.ESCAPE_OPERATOR);
OPERATORS.put(WHITESPACE_OPERATOR, SimpleQueryParser.WHITESPACE_OPERATOR);
OPERATORS.put(SimpleParams.AND_OPERATOR, SimpleQueryParser.AND_OPERATOR);
OPERATORS.put(SimpleParams.NOT_OPERATOR, SimpleQueryParser.NOT_OPERATOR);
OPERATORS.put(SimpleParams.OR_OPERATOR, SimpleQueryParser.OR_OPERATOR);
OPERATORS.put(SimpleParams.PREFIX_OPERATOR, SimpleQueryParser.PREFIX_OPERATOR);
OPERATORS.put(SimpleParams.PHRASE_OPERATOR, SimpleQueryParser.PHRASE_OPERATOR);
OPERATORS.put(SimpleParams.PRECEDENCE_OPERATORS, SimpleQueryParser.PRECEDENCE_OPERATORS);
OPERATORS.put(SimpleParams.ESCAPE_OPERATOR, SimpleQueryParser.ESCAPE_OPERATOR);
OPERATORS.put(SimpleParams.WHITESPACE_OPERATOR, SimpleQueryParser.WHITESPACE_OPERATOR);
OPERATORS.put(SimpleParams.FUZZY_OPERATOR, SimpleQueryParser.FUZZY_OPERATOR);
OPERATORS.put(SimpleParams.NEAR_OPERATOR, SimpleQueryParser.NEAR_OPERATOR);
}
/** No initialization is necessary so this method is empty. */

View File

@ -26,4 +26,25 @@ public interface SimpleParams {
/** Override the currently enabled/disabled query operators. */
public static String QO = "q.operators";
/** Enables {@code AND} operator (+) */
public static final String AND_OPERATOR = "AND";
/** Enables {@code NOT} operator (-) */
public static final String NOT_OPERATOR = "NOT";
/** Enables {@code OR} operator (|) */
public static final String OR_OPERATOR = "OR";
/** Enables {@code PREFIX} operator (*) */
public static final String PREFIX_OPERATOR = "PREFIX";
/** Enables {@code PHRASE} operator (") */
public static final String PHRASE_OPERATOR = "PHRASE";
/** Enables {@code PRECEDENCE} operators: {@code (} and {@code )} */
public static final String PRECEDENCE_OPERATORS = "PRECEDENCE";
/** Enables {@code ESCAPE} operator (\) */
public static final String ESCAPE_OPERATOR = "ESCAPE";
/** Enables {@code WHITESPACE} operators: ' ' '\n' '\r' '\t' */
public static final String WHITESPACE_OPERATOR = "WHITESPACE";
/** Enables {@code FUZZY} operator (~) */
public static final String FUZZY_OPERATOR = "FUZZY";
/** Enables {@code NEAR} operator (~) */
public static final String NEAR_OPERATOR = "NEAR";
}