mirror of https://github.com/apache/lucene.git
LUCENE-2261: make TopTermsScoringBooleanRewrite's PQ size configurable
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@912311 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
349ca0943e
commit
7b06e1c5db
|
@ -135,8 +135,11 @@ Optimizations
|
|||
* LUCENE-2137: Switch to AtomicInteger for some ref counting (Earwin
|
||||
Burrfoot via Mike McCandless)
|
||||
|
||||
* LUCENE-2123: Move FuzzyQuery rewrite as separate RewriteMode into
|
||||
MTQ. (Uwe Schindler, Robert Muir, Mike McCandless)
|
||||
* LUCENE-2123, LUCENE-2261: Move FuzzyQuery rewrite to separate RewriteMode
|
||||
into MultiTermQuery. The number of fuzzy expansions can be specified with
|
||||
the maxExpansions parameter to FuzzyQuery, but the default is limited to
|
||||
BooleanQuery.maxClauseCount() as before.
|
||||
(Uwe Schindler, Robert Muir, Mike McCandless)
|
||||
|
||||
* LUCENE-2135: On IndexReader.close, forcefully evict any entries from
|
||||
the FieldCache rather than waiting for the WeakHashMap to release
|
||||
|
|
|
@ -30,7 +30,7 @@ import java.io.IOException;
|
|||
* length of 0 - in this case, *every* term will be enumerated and
|
||||
* cause an edit score calculation.
|
||||
*
|
||||
* <p>This query uses {@link MultiTermQuery#TOP_TERMS_SCORING_BOOLEAN_REWRITE}
|
||||
* <p>This query uses {@link MultiTermQuery.TopTermsScoringBooleanQueryRewrite}
|
||||
* as default. So terms will be collected and scored according to their
|
||||
* edit distance. Only the top terms are used for building the {@link BooleanQuery}.
|
||||
* It is not recommended to change the rewrite mode for fuzzy queries.
|
||||
|
@ -39,6 +39,7 @@ public class FuzzyQuery extends MultiTermQuery {
|
|||
|
||||
public final static float defaultMinSimilarity = 0.5f;
|
||||
public final static int defaultPrefixLength = 0;
|
||||
public final static int defaultMaxExpansions = Integer.MAX_VALUE;
|
||||
|
||||
private float minimumSimilarity;
|
||||
private int prefixLength;
|
||||
|
@ -59,12 +60,15 @@ public class FuzzyQuery extends MultiTermQuery {
|
|||
* as the query term is considered similar to the query term if the edit distance
|
||||
* between both terms is less than <code>length(term)*0.5</code>
|
||||
* @param prefixLength length of common (non-fuzzy) prefix
|
||||
* @param maxExpansions the maximum number of terms to match. If this number is
|
||||
* greater than {@link BooleanQuery#getMaxClauseCount} when the query is rewritten,
|
||||
* then the maxClauseCount will be used instead.
|
||||
* @throws IllegalArgumentException if minimumSimilarity is >= 1 or < 0
|
||||
* or if prefixLength < 0
|
||||
*/
|
||||
public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength) throws IllegalArgumentException {
|
||||
public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength,
|
||||
int maxExpansions) {
|
||||
this.term = term;
|
||||
setRewriteMethod(TOP_TERMS_SCORING_BOOLEAN_REWRITE);
|
||||
|
||||
if (minimumSimilarity >= 1.0f)
|
||||
throw new IllegalArgumentException("minimumSimilarity >= 1");
|
||||
|
@ -72,6 +76,10 @@ public class FuzzyQuery extends MultiTermQuery {
|
|||
throw new IllegalArgumentException("minimumSimilarity < 0");
|
||||
if (prefixLength < 0)
|
||||
throw new IllegalArgumentException("prefixLength < 0");
|
||||
if (maxExpansions < 0)
|
||||
throw new IllegalArgumentException("maxExpansions < 0");
|
||||
|
||||
setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(maxExpansions));
|
||||
|
||||
if (term.text().length() > 1.0f / (1.0f - minimumSimilarity)) {
|
||||
this.termLongEnough = true;
|
||||
|
@ -82,17 +90,24 @@ public class FuzzyQuery extends MultiTermQuery {
|
|||
}
|
||||
|
||||
/**
|
||||
* Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, 0)}.
|
||||
* Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, prefixLength, Integer.MAX_VALUE)}.
|
||||
*/
|
||||
public FuzzyQuery(Term term, float minimumSimilarity) throws IllegalArgumentException {
|
||||
this(term, minimumSimilarity, defaultPrefixLength);
|
||||
public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength) {
|
||||
this(term, minimumSimilarity, prefixLength, defaultMaxExpansions);
|
||||
}
|
||||
|
||||
/**
|
||||
* Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, 0, Integer.MAX_VALUE)}.
|
||||
*/
|
||||
public FuzzyQuery(Term term, float minimumSimilarity) {
|
||||
this(term, minimumSimilarity, defaultPrefixLength, defaultMaxExpansions);
|
||||
}
|
||||
|
||||
/**
|
||||
* Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, 0.5f, 0)}.
|
||||
* Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, 0.5f, 0, Integer.MAX_VALUE)}.
|
||||
*/
|
||||
public FuzzyQuery(Term term) {
|
||||
this(term, defaultMinSimilarity, defaultPrefixLength);
|
||||
this(term, defaultMinSimilarity, defaultPrefixLength, defaultMaxExpansions);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -20,7 +20,6 @@ package org.apache.lucene.search;
|
|||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.PriorityQueue;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
|
@ -53,7 +52,7 @@ import org.apache.lucene.queryParser.QueryParser; // for javadoc
|
|||
* computing unhelpful scores, and it tries to pick the most
|
||||
* performant rewrite method given the query. If you
|
||||
* need scoring (like {@link FuzzyQuery}, use
|
||||
* {@link #TOP_TERMS_SCORING_BOOLEAN_REWRITE} which uses
|
||||
* {@link TopTermsScoringBooleanQueryRewrite} which uses
|
||||
* a priority queue to only collect competitive terms
|
||||
* and not hit this limitation.
|
||||
*
|
||||
|
@ -163,10 +162,41 @@ public abstract class MultiTermQuery extends Query {
|
|||
* @see #setRewriteMethod */
|
||||
public final static RewriteMethod SCORING_BOOLEAN_QUERY_REWRITE = new ScoringBooleanQueryRewrite();
|
||||
|
||||
private static final class TopTermsScoringBooleanQueryRewrite extends BooleanQueryRewrite {
|
||||
/** A rewrite method that first translates each term into
|
||||
* {@link BooleanClause.Occur#SHOULD} clause in a
|
||||
* BooleanQuery, and keeps the scores as computed by the
|
||||
* query.
|
||||
*
|
||||
* <p>This rewrite mode only uses the top scoring terms
|
||||
* so it will not overflow the boolean max clause count.
|
||||
* It is the default rewrite mode for {@link FuzzyQuery}.
|
||||
*
|
||||
* @see #setRewriteMethod */
|
||||
public static final class TopTermsScoringBooleanQueryRewrite extends BooleanQueryRewrite {
|
||||
private final int size;
|
||||
|
||||
/**
|
||||
* Create a TopTermsScoringBooleanQueryRewrite for
|
||||
* at most <code>size</code> terms.
|
||||
* <p>
|
||||
* NOTE: if {@link BooleanQuery#getMaxClauseCount} is smaller than
|
||||
* <code>size</code>, then it will be used instead.
|
||||
*/
|
||||
public TopTermsScoringBooleanQueryRewrite(int size) {
|
||||
this.size = size;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a TopTermsScoringBooleanQueryRewrite that is limited
|
||||
* to at most {@link BooleanQuery#getMaxClauseCount} terms.
|
||||
*/
|
||||
public TopTermsScoringBooleanQueryRewrite() {
|
||||
this(Integer.MAX_VALUE);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException {
|
||||
final int maxSize = BooleanQuery.getMaxClauseCount();
|
||||
final int maxSize = Math.min(size, BooleanQuery.getMaxClauseCount());
|
||||
final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
|
||||
collectTerms(reader, query, new TermCollector() {
|
||||
public boolean collect(Term t, float boost) {
|
||||
|
@ -195,10 +225,23 @@ public abstract class MultiTermQuery extends Query {
|
|||
query.incTotalNumberOfTerms(bq.clauses().size());
|
||||
return bq;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
final int prime = 17;
|
||||
int result = 1;
|
||||
result = prime * result + size;
|
||||
return result;
|
||||
}
|
||||
|
||||
// Make sure we are still a singleton even after deserializing
|
||||
protected Object readResolve() {
|
||||
return TOP_TERMS_SCORING_BOOLEAN_REWRITE;
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj) return true;
|
||||
if (obj == null) return false;
|
||||
if (getClass() != obj.getClass()) return false;
|
||||
TopTermsScoringBooleanQueryRewrite other = (TopTermsScoringBooleanQueryRewrite) obj;
|
||||
if (size != other.size) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
private static class ScoreTerm implements Comparable<ScoreTerm> {
|
||||
|
@ -213,18 +256,6 @@ public abstract class MultiTermQuery extends Query {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** A rewrite method that first translates each term into
|
||||
* {@link BooleanClause.Occur#SHOULD} clause in a
|
||||
* BooleanQuery, and keeps the scores as computed by the
|
||||
* query.
|
||||
*
|
||||
* <p>This rewrite mode only uses the top scoring terms
|
||||
* so it will not overflow the boolean max clause count.
|
||||
* It is the default rewrite mode for {@link FuzzyQuery}.
|
||||
*
|
||||
* @see #setRewriteMethod */
|
||||
public final static RewriteMethod TOP_TERMS_SCORING_BOOLEAN_REWRITE = new TopTermsScoringBooleanQueryRewrite();
|
||||
|
||||
private static class ConstantScoreBooleanQueryRewrite extends ScoringBooleanQueryRewrite implements Serializable {
|
||||
@Override
|
||||
|
|
|
@ -89,22 +89,16 @@ public class TestFuzzyQuery extends LuceneTestCase {
|
|||
assertEquals(order.get(i), term);
|
||||
}
|
||||
|
||||
// test BooleanQuery.maxClauseCount
|
||||
int savedClauseCount = BooleanQuery.getMaxClauseCount();
|
||||
try {
|
||||
BooleanQuery.setMaxClauseCount(2);
|
||||
// This query would normally return 3 documents, because 3 terms match (see above):
|
||||
query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMinSimilarity, 0);
|
||||
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||
assertEquals("only 2 documents should match", 2, hits.length);
|
||||
order = Arrays.asList("bbbbb","abbbb");
|
||||
for (int i = 0; i < hits.length; i++) {
|
||||
final String term = searcher.doc(hits[i].doc).get("field");
|
||||
//System.out.println(hits[i].score);
|
||||
assertEquals(order.get(i), term);
|
||||
}
|
||||
} finally {
|
||||
BooleanQuery.setMaxClauseCount(savedClauseCount);
|
||||
// test pq size by supplying maxExpansions=2
|
||||
// This query would normally return 3 documents, because 3 terms match (see above):
|
||||
query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMinSimilarity, 0, 2);
|
||||
hits = searcher.search(query, null, 1000).scoreDocs;
|
||||
assertEquals("only 2 documents should match", 2, hits.length);
|
||||
order = Arrays.asList("bbbbb","abbbb");
|
||||
for (int i = 0; i < hits.length; i++) {
|
||||
final String term = searcher.doc(hits[i].doc).get("field");
|
||||
//System.out.println(hits[i].score);
|
||||
assertEquals(order.get(i), term);
|
||||
}
|
||||
|
||||
// not similar enough:
|
||||
|
|
|
@ -111,7 +111,7 @@ public class TestTermRangeQuery extends LuceneTestCase {
|
|||
}
|
||||
|
||||
private void checkBooleanTerms(Searcher searcher, TermRangeQuery query, String... terms) throws IOException {
|
||||
query.setRewriteMethod(MultiTermQuery.TOP_TERMS_SCORING_BOOLEAN_REWRITE);
|
||||
query.setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite());
|
||||
final BooleanQuery bq = (BooleanQuery) searcher.rewrite(query);
|
||||
final Set<String> allowedTerms = new HashSet<String>(Arrays.asList(terms));
|
||||
assertEquals(allowedTerms.size(), bq.clauses().size());
|
||||
|
|
Loading…
Reference in New Issue