LUCENE-5979: Use the cost API to decide on whether to use random-access to intersect queries and filters.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1629598 13f79535-47bb-0310-9956-ffa450edef68
2025-02-15 22:45:52 +00:00 · 2014-10-06 09:19:59 +00:00 · 2014-10-06 09:19:59 +00:00 · 1a834a153a
commit 1a834a153a
parent 74ca91be6c
4 changed files with 18 additions and 41 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -1250,6 +1250,10 @@ New Features
  approximate value of the diameter of the earth at the given latitude.
  (Adrien Grand)

+* LUCENE-5979: FilteredQuery uses the cost API to decide on whether to use
+  random-access or leap-frog to intersect the filter with the query.
+  (Adrien Grand)
+
 Build

 * LUCENE-5217,LUCENE-5420: Maven config: get dependencies from Ant+Ivy config;
--- a/lucene/core/src/java/org/apache/lucene/search/FilteredQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/FilteredQuery.java
@ -254,12 +254,12 @@ public class FilteredQuery extends Query {
   * jumping past the target document. When both land on the same document, it's
   * collected.
   */
-  private static class LeapFrogScorer extends Scorer {
+  private static final class LeapFrogScorer extends Scorer {
    private final DocIdSetIterator secondary;
    private final DocIdSetIterator primary;
    private final Scorer scorer;
-    protected int primaryDoc = -1;
-    protected int secondaryDoc = -1;
+    private int primaryDoc = -1;
+    private int secondaryDoc = -1;

    protected LeapFrogScorer(Weight weight, DocIdSetIterator primary, DocIdSetIterator secondary, Scorer scorer) {
      super(weight);
@ -324,26 +324,6 @@ public class FilteredQuery extends Query {
    }
  }
  
-  // TODO once we have way to figure out if we use RA or LeapFrog we can remove this scorer
-  private static final class PrimaryAdvancedLeapFrogScorer extends LeapFrogScorer {
-    private final int firstFilteredDoc;
-
-    protected PrimaryAdvancedLeapFrogScorer(Weight weight, int firstFilteredDoc, DocIdSetIterator filterIter, Scorer other) {
-      super(weight, filterIter, other, other);
-      this.firstFilteredDoc = firstFilteredDoc;
-      this.primaryDoc = firstFilteredDoc; // initialize to prevent and advance call to move it further
-    }
-
-    @Override
-    protected int primaryNext() throws IOException {
-      if (secondaryDoc != -1) {
-        return super.primaryNext();
-      } else {
-        return firstFilteredDoc;
-      }
-    }
-  }
-  
  /** Rewrites the query. If the wrapped is an instance of
   * {@link MatchAllDocsQuery} it returns a {@link ConstantScoreQuery}. Otherwise
   * it returns a new {@code FilteredQuery} wrapping the rewritten query. */
@ -421,7 +401,7 @@ public class FilteredQuery extends Query {
   * A {@link FilterStrategy} that conditionally uses a random access filter if
   * the given {@link DocIdSet} supports random access (returns a non-null value
   * from {@link DocIdSet#bits()}) and
-   * {@link RandomAccessFilterStrategy#useRandomAccess(Bits, int)} returns
+   * {@link RandomAccessFilterStrategy#useRandomAccess(Bits, long)} returns
   * <code>true</code>. Otherwise this strategy falls back to a "zig-zag join" (
   * {@link FilteredQuery#LEAP_FROG_FILTER_FIRST_STRATEGY}) strategy.
   * 
@ -515,7 +495,7 @@ public class FilteredQuery extends Query {
   * A {@link FilterStrategy} that conditionally uses a random access filter if
   * the given {@link DocIdSet} supports random access (returns a non-null value
   * from {@link DocIdSet#bits()}) and
-   * {@link RandomAccessFilterStrategy#useRandomAccess(Bits, int)} returns
+   * {@link RandomAccessFilterStrategy#useRandomAccess(Bits, long)} returns
   * <code>true</code>. Otherwise this strategy falls back to a "zig-zag join" (
   * {@link FilteredQuery#LEAP_FROG_FILTER_FIRST_STRATEGY}) strategy .
   */
@ -528,25 +508,18 @@ public class FilteredQuery extends Query {
        // this means the filter does not accept any documents.
        return null;
      }  
-
-      final int firstFilterDoc = filterIter.nextDoc();
-      if (firstFilterDoc == DocIdSetIterator.NO_MORE_DOCS) {
-        return null;
-      }
      
      final Bits filterAcceptDocs = docIdSet.bits();
      // force if RA is requested
-      final boolean useRandomAccess = filterAcceptDocs != null && useRandomAccess(filterAcceptDocs, firstFilterDoc);
+      final boolean useRandomAccess = filterAcceptDocs != null && useRandomAccess(filterAcceptDocs, filterIter.cost());
      if (useRandomAccess) {
        // if we are using random access, we return the inner scorer, just with other acceptDocs
        return weight.scorer(context, filterAcceptDocs);
      } else {
-        assert firstFilterDoc > -1;
        // we are gonna advance() this scorer, so we set inorder=true/toplevel=false
        // we pass null as acceptDocs, as our filter has already respected acceptDocs, no need to do twice
        final Scorer scorer = weight.scorer(context, null);
-        // TODO once we have way to figure out if we use RA or LeapFrog we can remove this scorer
-        return (scorer == null) ? null : new PrimaryAdvancedLeapFrogScorer(weight, firstFilterDoc, filterIter, scorer);
+        return (scorer == null) ? null : new LeapFrogScorer(weight, filterIter, scorer, scorer);
      }
    }
    
@ -557,14 +530,14 @@ public class FilteredQuery extends Query {
     * However, when the filter is very sparse, it can be faster to execute the query+filter
     * as a conjunction in some cases.
     * 
-     * The default implementation returns <code>true</code> if the first document accepted by the
-     * filter is < 100.
+     * The default implementation returns <code>true</code> if the filter matches more than 1%
+     * of documents
     * 
     * @lucene.internal
     */
-    protected boolean useRandomAccess(Bits bits, int firstFilterDoc) {
-      //TODO once we have a cost API on filters and scorers we should rethink this heuristic
-      return firstFilterDoc < 100;
+    protected boolean useRandomAccess(Bits bits, long filterCost) {
+      // if the filter matches more than 1% of documents, we use random-access
+      return filterCost * 100 > bits.length();
    }
  }
  
--- a/lucene/core/src/test/org/apache/lucene/search/TestFilteredQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestFilteredQuery.java
@ -387,7 +387,7 @@ public class TestFilteredQuery extends LuceneTestCase {
    if (useRandomAccess) {
      return new FilteredQuery.RandomAccessFilterStrategy() {
        @Override
-        protected boolean useRandomAccess(Bits bits, int firstFilterDoc) {
+        protected boolean useRandomAccess(Bits bits, long filterCost) {
          return true;
        }
      };
--- a/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java
@ -1025,7 +1025,7 @@ public final class TestUtil {
      case 4:
        return new FilteredQuery.RandomAccessFilterStrategy() {
          @Override
-          protected boolean useRandomAccess(Bits bits, int firstFilterDoc) {
+          protected boolean useRandomAccess(Bits bits, long filterCost) {
            return LuceneTestCase.random().nextBoolean();
          }
        };