LUCENE-2261: make TopTermsScoringBooleanRewrite's PQ size configurable

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@912311 13f79535-47bb-0310-9956-ffa450edef68
2010-02-21 07:27:46 +00:00 · 2010-02-21 07:27:46 +00:00 · 7b06e1c5db
parent 349ca0943e
commit 7b06e1c5db
5 changed files with 89 additions and 46 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -135,8 +135,11 @@ Optimizations
 * LUCENE-2137: Switch to AtomicInteger for some ref counting (Earwin
  Burrfoot via Mike McCandless)

-* LUCENE-2123: Move FuzzyQuery rewrite as separate RewriteMode into
-  MTQ. (Uwe Schindler, Robert Muir, Mike McCandless)
+* LUCENE-2123, LUCENE-2261: Move FuzzyQuery rewrite to separate RewriteMode 
+  into MultiTermQuery. The number of fuzzy expansions can be specified with
+  the maxExpansions parameter to FuzzyQuery, but the default is limited to
+  BooleanQuery.maxClauseCount() as before. 
+  (Uwe Schindler, Robert Muir, Mike McCandless)

 * LUCENE-2135: On IndexReader.close, forcefully evict any entries from
  the FieldCache rather than waiting for the WeakHashMap to release
--- a/src/java/org/apache/lucene/search/FuzzyQuery.java
+++ b/src/java/org/apache/lucene/search/FuzzyQuery.java
@ -30,7 +30,7 @@ import java.io.IOException;
 * length of 0 - in this case, *every* term will be enumerated and
 * cause an edit score calculation.
 * 
- * <p>This query uses {@link MultiTermQuery#TOP_TERMS_SCORING_BOOLEAN_REWRITE}
+ * <p>This query uses {@link MultiTermQuery.TopTermsScoringBooleanQueryRewrite}
 * as default. So terms will be collected and scored according to their
 * edit distance. Only the top terms are used for building the {@link BooleanQuery}.
 * It is not recommended to change the rewrite mode for fuzzy queries.
@ -39,6 +39,7 @@ public class FuzzyQuery extends MultiTermQuery {
  
  public final static float defaultMinSimilarity = 0.5f;
  public final static int defaultPrefixLength = 0;
+  public final static int defaultMaxExpansions = Integer.MAX_VALUE;
  
  private float minimumSimilarity;
  private int prefixLength;
@ -59,12 +60,15 @@ public class FuzzyQuery extends MultiTermQuery {
   *  as the query term is considered similar to the query term if the edit distance
   *  between both terms is less than <code>length(term)*0.5</code>
   * @param prefixLength length of common (non-fuzzy) prefix
+   * @param maxExpansions the maximum number of terms to match. If this number is
+   *  greater than {@link BooleanQuery#getMaxClauseCount} when the query is rewritten, 
+   *  then the maxClauseCount will be used instead.
   * @throws IllegalArgumentException if minimumSimilarity is &gt;= 1 or &lt; 0
   * or if prefixLength &lt; 0
   */
-  public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength) throws IllegalArgumentException {
+  public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength,
+      int maxExpansions) {
    this.term = term;
-    setRewriteMethod(TOP_TERMS_SCORING_BOOLEAN_REWRITE);
    
    if (minimumSimilarity >= 1.0f)
      throw new IllegalArgumentException("minimumSimilarity >= 1");
@ -72,6 +76,10 @@ public class FuzzyQuery extends MultiTermQuery {
      throw new IllegalArgumentException("minimumSimilarity < 0");
    if (prefixLength < 0)
      throw new IllegalArgumentException("prefixLength < 0");
+    if (maxExpansions < 0)
+      throw new IllegalArgumentException("maxExpansions < 0");
+    
+    setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(maxExpansions));
    
    if (term.text().length() > 1.0f / (1.0f - minimumSimilarity)) {
      this.termLongEnough = true;
@ -82,17 +90,24 @@ public class FuzzyQuery extends MultiTermQuery {
  }
  
  /**
-   * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, 0)}.
+   * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, prefixLength, Integer.MAX_VALUE)}.
   */
-  public FuzzyQuery(Term term, float minimumSimilarity) throws IllegalArgumentException {
-    this(term, minimumSimilarity, defaultPrefixLength);
+  public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength) {
+    this(term, minimumSimilarity, prefixLength, defaultMaxExpansions);
+  }
+  
+  /**
+   * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, 0, Integer.MAX_VALUE)}.
+   */
+  public FuzzyQuery(Term term, float minimumSimilarity) {
+    this(term, minimumSimilarity, defaultPrefixLength, defaultMaxExpansions);
  }

  /**
-   * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, 0.5f, 0)}.
+   * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, 0.5f, 0, Integer.MAX_VALUE)}.
   */
  public FuzzyQuery(Term term) {
-    this(term, defaultMinSimilarity, defaultPrefixLength);
+    this(term, defaultMinSimilarity, defaultPrefixLength, defaultMaxExpansions);
  }
  
  /**
--- a/src/java/org/apache/lucene/search/MultiTermQuery.java
+++ b/src/java/org/apache/lucene/search/MultiTermQuery.java
@ -20,7 +20,6 @@ package org.apache.lucene.search;
 import java.io.IOException;
 import java.io.Serializable;
 import java.util.ArrayList;
-import java.util.Collection;
 import java.util.PriorityQueue;

 import org.apache.lucene.index.IndexReader;
@ -53,7 +52,7 @@ import org.apache.lucene.queryParser.QueryParser; // for javadoc
 * computing unhelpful scores, and it tries to pick the most
 * performant rewrite method given the query. If you
 * need scoring (like {@link FuzzyQuery}, use
- * {@link #TOP_TERMS_SCORING_BOOLEAN_REWRITE} which uses
+ * {@link TopTermsScoringBooleanQueryRewrite} which uses
 * a priority queue to only collect competitive terms
 * and not hit this limitation.
 *
@ -163,10 +162,41 @@ public abstract class MultiTermQuery extends Query {
   *  @see #setRewriteMethod */
  public final static RewriteMethod SCORING_BOOLEAN_QUERY_REWRITE = new ScoringBooleanQueryRewrite();

-  private static final class TopTermsScoringBooleanQueryRewrite extends BooleanQueryRewrite {
+  /** A rewrite method that first translates each term into
+   *  {@link BooleanClause.Occur#SHOULD} clause in a
+   *  BooleanQuery, and keeps the scores as computed by the
+   *  query.
+   *
+   * <p>This rewrite mode only uses the top scoring terms
+   * so it will not overflow the boolean max clause count.
+   * It is the default rewrite mode for {@link FuzzyQuery}.
+   *
+   *  @see #setRewriteMethod */
+  public static final class TopTermsScoringBooleanQueryRewrite extends BooleanQueryRewrite {
+    private final int size;
+    
+    /** 
+     * Create a TopTermsScoringBooleanQueryRewrite for 
+     * at most <code>size</code> terms.
+     * <p>
+     * NOTE: if {@link BooleanQuery#getMaxClauseCount} is smaller than 
+     * <code>size</code>, then it will be used instead. 
+     */
+    public TopTermsScoringBooleanQueryRewrite(int size) {
+      this.size = size;
+    }
+    
+    /** 
+     * Create a TopTermsScoringBooleanQueryRewrite that is limited
+     * to at most {@link BooleanQuery#getMaxClauseCount} terms. 
+     */
+    public TopTermsScoringBooleanQueryRewrite() {
+      this(Integer.MAX_VALUE);
+    }
+    
    @Override
    public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException {
-      final int maxSize = BooleanQuery.getMaxClauseCount();
+      final int maxSize = Math.min(size, BooleanQuery.getMaxClauseCount());
      final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
      collectTerms(reader, query, new TermCollector() {
        public boolean collect(Term t, float boost) {
@ -195,10 +225,23 @@ public abstract class MultiTermQuery extends Query {
      query.incTotalNumberOfTerms(bq.clauses().size());
      return bq;
    }
+  
+    @Override
+    public int hashCode() {
+      final int prime = 17;
+      int result = 1;
+      result = prime * result + size;
+      return result;
+    }

-    // Make sure we are still a singleton even after deserializing
-    protected Object readResolve() {
-      return TOP_TERMS_SCORING_BOOLEAN_REWRITE;
+    @Override
+    public boolean equals(Object obj) {
+      if (this == obj) return true;
+      if (obj == null) return false;
+      if (getClass() != obj.getClass()) return false;
+      TopTermsScoringBooleanQueryRewrite other = (TopTermsScoringBooleanQueryRewrite) obj;
+      if (size != other.size) return false;
+      return true;
    }
  
    private static class ScoreTerm implements Comparable<ScoreTerm> {
@ -213,18 +256,6 @@ public abstract class MultiTermQuery extends Query {
      }
    }
  }
-  
-  /** A rewrite method that first translates each term into
-   *  {@link BooleanClause.Occur#SHOULD} clause in a
-   *  BooleanQuery, and keeps the scores as computed by the
-   *  query.
-   *
-   * <p>This rewrite mode only uses the top scoring terms
-   * so it will not overflow the boolean max clause count.
-   * It is the default rewrite mode for {@link FuzzyQuery}.
-   *
-   *  @see #setRewriteMethod */
-  public final static RewriteMethod TOP_TERMS_SCORING_BOOLEAN_REWRITE = new TopTermsScoringBooleanQueryRewrite();

  private static class ConstantScoreBooleanQueryRewrite extends ScoringBooleanQueryRewrite implements Serializable {
    @Override
--- a/src/test/org/apache/lucene/search/TestFuzzyQuery.java
+++ b/src/test/org/apache/lucene/search/TestFuzzyQuery.java
@ -89,22 +89,16 @@ public class TestFuzzyQuery extends LuceneTestCase {
      assertEquals(order.get(i), term);
    }

-    // test BooleanQuery.maxClauseCount
-    int savedClauseCount = BooleanQuery.getMaxClauseCount();
-    try {
-      BooleanQuery.setMaxClauseCount(2);
-      // This query would normally return 3 documents, because 3 terms match (see above):
-      query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMinSimilarity, 0);   
-      hits = searcher.search(query, null, 1000).scoreDocs;
-      assertEquals("only 2 documents should match", 2, hits.length);
-      order = Arrays.asList("bbbbb","abbbb");
-      for (int i = 0; i < hits.length; i++) {
-        final String term = searcher.doc(hits[i].doc).get("field");
-        //System.out.println(hits[i].score);
-        assertEquals(order.get(i), term);
-      }
-    } finally {
-      BooleanQuery.setMaxClauseCount(savedClauseCount);
+    // test pq size by supplying maxExpansions=2
+    // This query would normally return 3 documents, because 3 terms match (see above):
+    query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMinSimilarity, 0, 2); 
+    hits = searcher.search(query, null, 1000).scoreDocs;
+    assertEquals("only 2 documents should match", 2, hits.length);
+    order = Arrays.asList("bbbbb","abbbb");
+    for (int i = 0; i < hits.length; i++) {
+      final String term = searcher.doc(hits[i].doc).get("field");
+      //System.out.println(hits[i].score);
+      assertEquals(order.get(i), term);
    }

    // not similar enough:
--- a/src/test/org/apache/lucene/search/TestTermRangeQuery.java
+++ b/src/test/org/apache/lucene/search/TestTermRangeQuery.java
@ -111,7 +111,7 @@ public class TestTermRangeQuery extends LuceneTestCase {
  }
  
  private void checkBooleanTerms(Searcher searcher, TermRangeQuery query, String... terms) throws IOException {
-    query.setRewriteMethod(MultiTermQuery.TOP_TERMS_SCORING_BOOLEAN_REWRITE);
+    query.setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite());
    final BooleanQuery bq = (BooleanQuery) searcher.rewrite(query);
    final Set<String> allowedTerms = new HashSet<String>(Arrays.asList(terms));
    assertEquals(allowedTerms.size(), bq.clauses().size());