LUCENE-2261: make TopTermsScoringBooleanRewrite's PQ size configurable

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@912311 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-02-21 07:27:46 +00:00
parent 349ca0943e
commit 7b06e1c5db
5 changed files with 89 additions and 46 deletions

View File

@ -135,8 +135,11 @@ Optimizations
* LUCENE-2137: Switch to AtomicInteger for some ref counting (Earwin
Burrfoot via Mike McCandless)
* LUCENE-2123: Move FuzzyQuery rewrite as separate RewriteMode into
MTQ. (Uwe Schindler, Robert Muir, Mike McCandless)
* LUCENE-2123, LUCENE-2261: Move FuzzyQuery rewrite to separate RewriteMode
into MultiTermQuery. The number of fuzzy expansions can be specified with
the maxExpansions parameter to FuzzyQuery, but the default is limited to
BooleanQuery.maxClauseCount() as before.
(Uwe Schindler, Robert Muir, Mike McCandless)
* LUCENE-2135: On IndexReader.close, forcefully evict any entries from
the FieldCache rather than waiting for the WeakHashMap to release

View File

@ -30,7 +30,7 @@ import java.io.IOException;
* length of 0 - in this case, *every* term will be enumerated and
* cause an edit score calculation.
*
* <p>This query uses {@link MultiTermQuery#TOP_TERMS_SCORING_BOOLEAN_REWRITE}
* <p>This query uses {@link MultiTermQuery.TopTermsScoringBooleanQueryRewrite}
* as default. So terms will be collected and scored according to their
* edit distance. Only the top terms are used for building the {@link BooleanQuery}.
* It is not recommended to change the rewrite mode for fuzzy queries.
@ -39,6 +39,7 @@ public class FuzzyQuery extends MultiTermQuery {
public final static float defaultMinSimilarity = 0.5f;
public final static int defaultPrefixLength = 0;
public final static int defaultMaxExpansions = Integer.MAX_VALUE;
private float minimumSimilarity;
private int prefixLength;
@ -59,12 +60,15 @@ public class FuzzyQuery extends MultiTermQuery {
* as the query term is considered similar to the query term if the edit distance
* between both terms is less than <code>length(term)*0.5</code>
* @param prefixLength length of common (non-fuzzy) prefix
* @param maxExpansions the maximum number of terms to match. If this number is
* greater than {@link BooleanQuery#getMaxClauseCount} when the query is rewritten,
* then the maxClauseCount will be used instead.
* @throws IllegalArgumentException if minimumSimilarity is &gt;= 1 or &lt; 0
* or if prefixLength &lt; 0
*/
public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength) throws IllegalArgumentException {
public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength,
int maxExpansions) {
this.term = term;
setRewriteMethod(TOP_TERMS_SCORING_BOOLEAN_REWRITE);
if (minimumSimilarity >= 1.0f)
throw new IllegalArgumentException("minimumSimilarity >= 1");
@ -72,6 +76,10 @@ public class FuzzyQuery extends MultiTermQuery {
throw new IllegalArgumentException("minimumSimilarity < 0");
if (prefixLength < 0)
throw new IllegalArgumentException("prefixLength < 0");
if (maxExpansions < 0)
throw new IllegalArgumentException("maxExpansions < 0");
setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(maxExpansions));
if (term.text().length() > 1.0f / (1.0f - minimumSimilarity)) {
this.termLongEnough = true;
@ -82,17 +90,24 @@ public class FuzzyQuery extends MultiTermQuery {
}
/**
* Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, 0)}.
* Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, prefixLength, Integer.MAX_VALUE)}.
*/
public FuzzyQuery(Term term, float minimumSimilarity) throws IllegalArgumentException {
this(term, minimumSimilarity, defaultPrefixLength);
public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength) {
this(term, minimumSimilarity, prefixLength, defaultMaxExpansions);
}
/**
* Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, 0, Integer.MAX_VALUE)}.
*/
public FuzzyQuery(Term term, float minimumSimilarity) {
this(term, minimumSimilarity, defaultPrefixLength, defaultMaxExpansions);
}
/**
* Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, 0.5f, 0)}.
* Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, 0.5f, 0, Integer.MAX_VALUE)}.
*/
public FuzzyQuery(Term term) {
this(term, defaultMinSimilarity, defaultPrefixLength);
this(term, defaultMinSimilarity, defaultPrefixLength, defaultMaxExpansions);
}
/**

View File

@ -20,7 +20,6 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.PriorityQueue;
import org.apache.lucene.index.IndexReader;
@ -53,7 +52,7 @@ import org.apache.lucene.queryParser.QueryParser; // for javadoc
* computing unhelpful scores, and it tries to pick the most
* performant rewrite method given the query. If you
* need scoring (like {@link FuzzyQuery}, use
* {@link #TOP_TERMS_SCORING_BOOLEAN_REWRITE} which uses
* {@link TopTermsScoringBooleanQueryRewrite} which uses
* a priority queue to only collect competitive terms
* and not hit this limitation.
*
@ -163,10 +162,41 @@ public abstract class MultiTermQuery extends Query {
* @see #setRewriteMethod */
public final static RewriteMethod SCORING_BOOLEAN_QUERY_REWRITE = new ScoringBooleanQueryRewrite();
private static final class TopTermsScoringBooleanQueryRewrite extends BooleanQueryRewrite {
/** A rewrite method that first translates each term into
* {@link BooleanClause.Occur#SHOULD} clause in a
* BooleanQuery, and keeps the scores as computed by the
* query.
*
* <p>This rewrite mode only uses the top scoring terms
* so it will not overflow the boolean max clause count.
* It is the default rewrite mode for {@link FuzzyQuery}.
*
* @see #setRewriteMethod */
public static final class TopTermsScoringBooleanQueryRewrite extends BooleanQueryRewrite {
private final int size;
/**
* Create a TopTermsScoringBooleanQueryRewrite for
* at most <code>size</code> terms.
* <p>
* NOTE: if {@link BooleanQuery#getMaxClauseCount} is smaller than
* <code>size</code>, then it will be used instead.
*/
public TopTermsScoringBooleanQueryRewrite(int size) {
this.size = size;
}
/**
* Create a TopTermsScoringBooleanQueryRewrite that is limited
* to at most {@link BooleanQuery#getMaxClauseCount} terms.
*/
public TopTermsScoringBooleanQueryRewrite() {
this(Integer.MAX_VALUE);
}
@Override
public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException {
final int maxSize = BooleanQuery.getMaxClauseCount();
final int maxSize = Math.min(size, BooleanQuery.getMaxClauseCount());
final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
collectTerms(reader, query, new TermCollector() {
public boolean collect(Term t, float boost) {
@ -195,10 +225,23 @@ public abstract class MultiTermQuery extends Query {
query.incTotalNumberOfTerms(bq.clauses().size());
return bq;
}
@Override
public int hashCode() {
final int prime = 17;
int result = 1;
result = prime * result + size;
return result;
}
// Make sure we are still a singleton even after deserializing
protected Object readResolve() {
return TOP_TERMS_SCORING_BOOLEAN_REWRITE;
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (obj == null) return false;
if (getClass() != obj.getClass()) return false;
TopTermsScoringBooleanQueryRewrite other = (TopTermsScoringBooleanQueryRewrite) obj;
if (size != other.size) return false;
return true;
}
private static class ScoreTerm implements Comparable<ScoreTerm> {
@ -213,18 +256,6 @@ public abstract class MultiTermQuery extends Query {
}
}
}
/** A rewrite method that first translates each term into
* {@link BooleanClause.Occur#SHOULD} clause in a
* BooleanQuery, and keeps the scores as computed by the
* query.
*
* <p>This rewrite mode only uses the top scoring terms
* so it will not overflow the boolean max clause count.
* It is the default rewrite mode for {@link FuzzyQuery}.
*
* @see #setRewriteMethod */
public final static RewriteMethod TOP_TERMS_SCORING_BOOLEAN_REWRITE = new TopTermsScoringBooleanQueryRewrite();
private static class ConstantScoreBooleanQueryRewrite extends ScoringBooleanQueryRewrite implements Serializable {
@Override

View File

@ -89,22 +89,16 @@ public class TestFuzzyQuery extends LuceneTestCase {
assertEquals(order.get(i), term);
}
// test BooleanQuery.maxClauseCount
int savedClauseCount = BooleanQuery.getMaxClauseCount();
try {
BooleanQuery.setMaxClauseCount(2);
// This query would normally return 3 documents, because 3 terms match (see above):
query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMinSimilarity, 0);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals("only 2 documents should match", 2, hits.length);
order = Arrays.asList("bbbbb","abbbb");
for (int i = 0; i < hits.length; i++) {
final String term = searcher.doc(hits[i].doc).get("field");
//System.out.println(hits[i].score);
assertEquals(order.get(i), term);
}
} finally {
BooleanQuery.setMaxClauseCount(savedClauseCount);
// test pq size by supplying maxExpansions=2
// This query would normally return 3 documents, because 3 terms match (see above):
query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMinSimilarity, 0, 2);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals("only 2 documents should match", 2, hits.length);
order = Arrays.asList("bbbbb","abbbb");
for (int i = 0; i < hits.length; i++) {
final String term = searcher.doc(hits[i].doc).get("field");
//System.out.println(hits[i].score);
assertEquals(order.get(i), term);
}
// not similar enough:

View File

@ -111,7 +111,7 @@ public class TestTermRangeQuery extends LuceneTestCase {
}
private void checkBooleanTerms(Searcher searcher, TermRangeQuery query, String... terms) throws IOException {
query.setRewriteMethod(MultiTermQuery.TOP_TERMS_SCORING_BOOLEAN_REWRITE);
query.setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite());
final BooleanQuery bq = (BooleanQuery) searcher.rewrite(query);
final Set<String> allowedTerms = new HashSet<String>(Arrays.asList(terms));
assertEquals(allowedTerms.size(), bq.clauses().size());