Add #scorerSupplier support to DocValuesRewriteMethod along with singleton doc value optimization (#1020)

2022-07-28 11:12:21 -07:00 · 2022-07-28 11:12:21 -07:00 · 4ebc249dbc
parent bb752c774c
commit 4ebc249dbc
2 changed files with 94 additions and 40 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -100,6 +100,8 @@ Optimizations

 * LUCENE-10661: Reduce memory copy in BytesStore. (luyuncheng)

+* GITHUB#1020: Support #scorerSupplier and small optimizations to DocValuesRewriteMethod. (Greg Miller)
+
 Bug Fixes
 ---------------------
 * LUCENE-10663: Fix KnnVectorQuery explain with multiple segments. (Shiming Li)
--- a/lucene/core/src/java/org/apache/lucene/search/DocValuesRewriteMethod.java
+++ b/lucene/core/src/java/org/apache/lucene/search/DocValuesRewriteMethod.java
@ -20,6 +20,7 @@ import java.io.IOException;
 import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.SortedDocValues;
 import org.apache.lucene.index.SortedSetDocValues;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
@ -83,21 +84,25 @@ public final class DocValuesRewriteMethod extends MultiTermQuery.RewriteMethod {

        @Override
        public Matches matches(LeafReaderContext context, int doc) throws IOException {
-          final SortedSetDocValues fcsi = DocValues.getSortedSet(context.reader(), query.field);
+          final SortedSetDocValues values = DocValues.getSortedSet(context.reader(), query.field);
          return MatchesUtils.forField(
              query.field,
              () ->
                  DisjunctionMatchesIterator.fromTermsEnum(
-                      context, doc, query, query.field, getTermsEnum(fcsi)));
+                      context, doc, query, query.field, getTermsEnum(values)));
        }

-        private TermsEnum getTermsEnum(SortedSetDocValues fcsi) throws IOException {
+        /**
+         * Create a TermsEnum that provides the intersection of the query terms with the terms
+         * present in the doc values.
+         */
+        private TermsEnum getTermsEnum(SortedSetDocValues values) throws IOException {
          return query.getTermsEnum(
              new Terms() {

                @Override
                public TermsEnum iterator() throws IOException {
-                  return fcsi.termsEnum();
+                  return values.termsEnum();
                }

                @Override
@ -142,46 +147,93 @@ public final class DocValuesRewriteMethod extends MultiTermQuery.RewriteMethod {
              });
        }

+        @Override
+        public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
+          final SortedSetDocValues values = DocValues.getSortedSet(context.reader(), query.field);
+          if (values.getValueCount() == 0) {
+            return null; // no values/docs so nothing can match
+          }
+
+          final Weight weight = this;
+          return new ScorerSupplier() {
+            @Override
+            public Scorer get(long leadCost) throws IOException {
+              // Create a TermsEnum that will provide the intersection of the terms specified in the
+              // query with the values present in the doc values:
+              TermsEnum termsEnum = getTermsEnum(values);
+              assert termsEnum != null;
+
+              if (termsEnum.next() == null) {
+                // no matching terms
+                return new ConstantScoreScorer(
+                    weight, score(), scoreMode, DocIdSetIterator.empty());
+              }
+
+              // Create a bit set for the "term set" ordinals (these are the terms provided by the
+              // query that are actually present in the doc values field). Cannot use FixedBitSet
+              // because we require long index (ord):
+              final LongBitSet termSet = new LongBitSet(values.getValueCount());
+              do {
+                long ord = termsEnum.ord();
+                if (ord >= 0) {
+                  termSet.set(ord);
+                }
+              } while (termsEnum.next() != null);
+
+              final SortedDocValues singleton = DocValues.unwrapSingleton(values);
+              final TwoPhaseIterator iterator;
+              if (singleton != null) {
+                iterator =
+                    new TwoPhaseIterator(singleton) {
+                      @Override
+                      public boolean matches() throws IOException {
+                        return termSet.get(singleton.ordValue());
+                      }
+
+                      @Override
+                      public float matchCost() {
+                        return 3; // lookup in a bitset
+                      }
+                    };
+              } else {
+                iterator =
+                    new TwoPhaseIterator(values) {
+                      @Override
+                      public boolean matches() throws IOException {
+                        for (int i = 0; i < values.docValueCount(); i++) {
+                          if (termSet.get(values.nextOrd())) {
+                            return true;
+                          }
+                        }
+                        return false;
+                      }
+
+                      @Override
+                      public float matchCost() {
+                        return 3; // lookup in a bitset
+                      }
+                    };
+              }
+
+              return new ConstantScoreScorer(weight, score(), scoreMode, iterator);
+            }
+
+            @Override
+            public long cost() {
+              // We have no prior knowledge of how many docs might match for any given query term,
+              // so we assume that all docs with a value could be a match:
+              return values.cost();
+            }
+          };
+        }
+
        @Override
        public Scorer scorer(LeafReaderContext context) throws IOException {
-          final SortedSetDocValues fcsi = DocValues.getSortedSet(context.reader(), query.field);
-          TermsEnum termsEnum = getTermsEnum(fcsi);
-          assert termsEnum != null;
-          if (termsEnum.next() == null) {
-            // no matching terms
+          final ScorerSupplier scorerSupplier = scorerSupplier(context);
+          if (scorerSupplier == null) {
            return null;
          }
-          // fill into a bitset
-          // Cannot use FixedBitSet because we require long index (ord):
-          final LongBitSet termSet = new LongBitSet(fcsi.getValueCount());
-          do {
-            long ord = termsEnum.ord();
-            if (ord >= 0) {
-              termSet.set(ord);
-            }
-          } while (termsEnum.next() != null);
-
-          return new ConstantScoreScorer(
-              this,
-              score(),
-              scoreMode,
-              new TwoPhaseIterator(fcsi) {
-
-                @Override
-                public boolean matches() throws IOException {
-                  for (int i = 0; i < fcsi.docValueCount(); i++) {
-                    if (termSet.get(fcsi.nextOrd())) {
-                      return true;
-                    }
-                  }
-                  return false;
-                }
-
-                @Override
-                public float matchCost() {
-                  return 3; // lookup in a bitset
-                }
-              });
+          return scorerSupplier.get(Long.MAX_VALUE);
        }

        @Override