LUCENE-10458: BoundedDocSetIdIterator may supply an incorrect count in Weight#count(LeafReaderContext) when missingValue is enabled (#736)

2025-02-09 03:25:15 +00:00 · 2022-03-23 22:54:52 +08:00 · 2022-03-23 22:54:52 +08:00 · 226880dd33
commit 226880dd33
parent 691760be93
3 changed files with 158 additions and 37 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -11,6 +11,10 @@ API Changes

 New Features
 ---------------------
+
+* LUCENE-10385: Implement Weight#count on IndexSortSortedNumericDocValuesRangeQuery
+  to speed up computing the number of hits when possible. (Lu Xugang, Luca Cavanna, Adrien Grand)
+
 * LUCENE-10422: Monitor Improvements: `Monitor` can use a custom `Directory` 
 implementation. `Monitor` can be created with a readonly `QueryIndex` in order to 
 have readonly `Monitor` instances. (Niko Usai)
@ -21,6 +25,7 @@ Improvements

 Optimizations
 ---------------------
+
 * LUCENE-10452: Hunspell: call checkCanceled less frequently to reduce the overhead (Peter Gromov)

 * LUCENE-10451: Hunspell: don't perform potentially expensive spellchecking after timeout (Peter Gromov)
@ -150,9 +155,6 @@ New Features
  based on TotalHitCountCollector that allows users to parallelize counting the
  number of hits. (Luca Cavanna, Adrien Grand)

-* LUCENE-10385: Implement Weight#count on IndexSortSortedNumericDocValuesRangeQuery
-  to speed up computing the number of hits when possible. (Luca Cavanna, Adrien Grand)
-
 * LUCENE-10403: Add ArrayUtil#grow(T[]). (Greg Miller)

 * LUCENE-10414: Add fn:fuzzyTerm interval function to flexible query parser (Dawid Weiss, 
--- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/IndexSortSortedNumericDocValuesRangeQuery.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/IndexSortSortedNumericDocValuesRangeQuery.java
@ -20,8 +20,10 @@ import java.io.IOException;
 import java.util.Objects;
 import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.PointValues;
 import org.apache.lucene.index.SortedNumericDocValues;
 import org.apache.lucene.search.ConstantScoreScorer;
 import org.apache.lucene.search.ConstantScoreWeight;
@ -198,16 +200,18 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query {

      @Override
      public int count(LeafReaderContext context) throws IOException {
-        BoundedDocSetIdIterator disi = getDocIdSetIteratorOrNull(context);
-        if (disi != null) {
-          return disi.lastDoc - disi.firstDoc;
+        if (context.reader().hasDeletions() == false) {
+          BoundedDocIdSetIterator disi = getDocIdSetIteratorOrNull(context);
+          if (disi != null && disi.delegate == null) {
+            return disi.lastDoc - disi.firstDoc;
+          }
        }
        return fallbackWeight.count(context);
      }
    };
  }

-  private BoundedDocSetIdIterator getDocIdSetIteratorOrNull(LeafReaderContext context)
+  private BoundedDocIdSetIterator getDocIdSetIteratorOrNull(LeafReaderContext context)
      throws IOException {
    SortedNumericDocValues sortedNumericValues =
        DocValues.getSortedNumeric(context.reader(), field);
@ -237,7 +241,7 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query {
   * {@link DocIdSetIterator} makes sure to wrap the original docvalues to skip over documents with
   * no value.
   */
-  private BoundedDocSetIdIterator getDocIdSetIterator(
+  private BoundedDocIdSetIterator getDocIdSetIterator(
      SortField sortField, LeafReaderContext context, DocIdSetIterator delegate)
      throws IOException {
    long lower = sortField.getReverse() ? upperValue : lowerValue;
@ -278,7 +282,19 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query {
    }

    int lastDocIdExclusive = high + 1;
-    return new BoundedDocSetIdIterator(firstDocIdInclusive, lastDocIdExclusive, delegate);
+    Object missingValue = sortField.getMissingValue();
+    BoundedDocIdSetIterator disi;
+    LeafReader reader = context.reader();
+    PointValues pointValues = reader.getPointValues(field);
+    final long missingLongValue = missingValue == null ? 0L : (long) missingValue;
+    // all documents have docValues or missing value falls outside the range
+    if ((pointValues != null && pointValues.getDocCount() == reader.maxDoc())
+        || (missingLongValue < lowerValue || missingLongValue > upperValue)) {
+      disi = new BoundedDocIdSetIterator(firstDocIdInclusive, lastDocIdExclusive, null);
+    } else {
+      disi = new BoundedDocIdSetIterator(firstDocIdInclusive, lastDocIdExclusive, delegate);
+    }
+    return disi;
  }

  /** Compares the given document's value with a stored reference value. */
@ -306,14 +322,14 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query {
   * A doc ID set iterator that wraps a delegate iterator and only returns doc IDs in the range
   * [firstDocInclusive, lastDoc).
   */
-  private static class BoundedDocSetIdIterator extends DocIdSetIterator {
+  private static class BoundedDocIdSetIterator extends DocIdSetIterator {
    private final int firstDoc;
    private final int lastDoc;
    private final DocIdSetIterator delegate;

    private int docID = -1;

-    BoundedDocSetIdIterator(int firstDoc, int lastDoc, DocIdSetIterator delegate) {
+    BoundedDocIdSetIterator(int firstDoc, int lastDoc, DocIdSetIterator delegate) {
      this.firstDoc = firstDoc;
      this.lastDoc = lastDoc;
      this.delegate = delegate;
@ -335,7 +351,12 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query {
        target = firstDoc;
      }

-      int result = delegate.advance(target);
+      int result;
+      if (delegate != null) {
+        result = delegate.advance(target);
+      } else {
+        result = target;
+      }
      if (result < lastDoc) {
        docID = result;
      } else {
--- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestIndexSortSortedNumericDocValuesRangeQuery.java
+++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestIndexSortSortedNumericDocValuesRangeQuery.java
@ -20,9 +20,11 @@ import static org.hamcrest.CoreMatchers.instanceOf;

 import java.io.IOException;
 import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
 import org.apache.lucene.document.LongPoint;
 import org.apache.lucene.document.SortedNumericDocValuesField;
 import org.apache.lucene.document.SortedSetDocValuesField;
+import org.apache.lucene.document.StringField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriterConfig;
@ -59,7 +61,14 @@ public class TestIndexSortSortedNumericDocValuesRangeQuery extends LuceneTestCas
      IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
      boolean reverse = random().nextBoolean();
      SortField sortField = new SortedNumericSortField("dv", SortField.Type.LONG, reverse);
-      sortField.setMissingValue(random().nextLong());
+      boolean enableMissingValue = random().nextBoolean();
+      if (enableMissingValue) {
+        long missingValue =
+            random().nextBoolean()
+                ? TestUtil.nextLong(random(), -100, 10000)
+                : (random().nextBoolean() ? Long.MIN_VALUE : Long.MAX_VALUE);
+        sortField.setMissingValue(missingValue);
+      }
      iwc.setIndexSort(new Sort(sortField));

      RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
@ -459,30 +468,6 @@ public class TestIndexSortSortedNumericDocValuesRangeQuery extends LuceneTestCas
    reader.close();
  }

-  public void testCount() throws IOException {
-    Directory dir = newDirectory();
-    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
-    Sort indexSort = new Sort(new SortedNumericSortField("field", SortField.Type.LONG));
-    iwc.setIndexSort(indexSort);
-    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
-    Document doc = new Document();
-    doc.add(new SortedNumericDocValuesField("field", 10));
-    writer.addDocument(doc);
-    IndexReader reader = writer.getReader();
-    IndexSearcher searcher = newSearcher(reader);
-
-    Query fallbackQuery = LongPoint.newRangeQuery("field", 1, 42);
-    Query query = new IndexSortSortedNumericDocValuesRangeQuery("field", 1, 42, fallbackQuery);
-    Weight weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
-    for (LeafReaderContext context : searcher.getLeafContexts()) {
-      assertEquals(1, weight.count(context));
-    }
-
-    writer.close();
-    reader.close();
-    dir.close();
-  }
-
  public void testFallbackCount() throws IOException {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
@ -509,6 +494,119 @@ public class TestIndexSortSortedNumericDocValuesRangeQuery extends LuceneTestCas
    dir.close();
  }

+  public void testCompareCount() throws IOException {
+    final int iters = atLeast(10);
+    for (int iter = 0; iter < iters; ++iter) {
+      Directory dir = newDirectory();
+      IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
+      SortField sortField = new SortedNumericSortField("field", SortField.Type.LONG);
+      boolean enableMissingValue = random().nextBoolean();
+      if (enableMissingValue) {
+        long missingValue =
+            random().nextBoolean()
+                ? TestUtil.nextLong(random(), -100, 10000)
+                : (random().nextBoolean() ? Long.MIN_VALUE : Long.MAX_VALUE);
+        sortField.setMissingValue(missingValue);
+      }
+      iwc.setIndexSort(new Sort(sortField));
+
+      RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
+
+      final int numDocs = atLeast(100);
+      for (int i = 0; i < numDocs; ++i) {
+        Document doc = new Document();
+        final int numValues = TestUtil.nextInt(random(), 0, 1);
+        for (int j = 0; j < numValues; ++j) {
+          final long value = TestUtil.nextLong(random(), -100, 10000);
+          doc = createSNDVAndPointDocument("field", value);
+        }
+        writer.addDocument(doc);
+      }
+
+      if (random().nextBoolean()) {
+        writer.deleteDocuments(LongPoint.newRangeQuery("field", 0L, 10L));
+      }
+
+      final IndexReader reader = writer.getReader();
+      final IndexSearcher searcher = newSearcher(reader);
+      writer.close();
+
+      for (int i = 0; i < 100; ++i) {
+        final long min =
+            random().nextBoolean() ? Long.MIN_VALUE : TestUtil.nextLong(random(), -100, 10000);
+        final long max =
+            random().nextBoolean() ? Long.MAX_VALUE : TestUtil.nextLong(random(), -100, 10000);
+        final Query q1 = LongPoint.newRangeQuery("field", min, max);
+
+        final Query fallbackQuery = LongPoint.newRangeQuery("field", min, max);
+        final Query q2 =
+            new IndexSortSortedNumericDocValuesRangeQuery("field", min, max, fallbackQuery);
+        final Weight weight1 = q1.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
+        final Weight weight2 = q2.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
+        assertSameCount(weight1, weight2, searcher);
+      }
+
+      reader.close();
+      dir.close();
+    }
+  }
+
+  private void assertSameCount(Weight weight1, Weight weight2, IndexSearcher searcher)
+      throws IOException {
+    for (LeafReaderContext context : searcher.getLeafContexts()) {
+      assertEquals(weight1.count(context), weight2.count(context));
+    }
+  }
+
+  public void testCountBoundary() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
+    SortField sortField = new SortedNumericSortField("field", SortField.Type.LONG);
+    boolean useLower = random().nextBoolean();
+    long lowerValue = 1;
+    long upperValue = 100;
+    sortField.setMissingValue(useLower ? lowerValue : upperValue);
+    Sort indexSort = new Sort(sortField);
+    iwc.setIndexSort(indexSort);
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
+
+    writer.addDocument(
+        createSNDVAndPointDocument("field", TestUtil.nextLong(random(), lowerValue, upperValue)));
+    writer.addDocument(
+        createSNDVAndPointDocument("field", TestUtil.nextLong(random(), lowerValue, upperValue)));
+    // missingValue
+    writer.addDocument(createMissingValueDocument());
+
+    IndexReader reader = writer.getReader();
+    IndexSearcher searcher = newSearcher(reader);
+
+    Query fallbackQuery = LongPoint.newRangeQuery("field", lowerValue, upperValue);
+    Query query =
+        new IndexSortSortedNumericDocValuesRangeQuery(
+            "field", lowerValue, upperValue, fallbackQuery);
+    Weight weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
+    for (LeafReaderContext context : searcher.getLeafContexts()) {
+      assertEquals(2, weight.count(context));
+    }
+
+    writer.close();
+    reader.close();
+    dir.close();
+  }
+
+  private Document createMissingValueDocument() {
+    Document doc = new Document();
+    doc.add(new StringField("foo", "fox", Field.Store.YES));
+    return doc;
+  }
+
+  private Document createSNDVAndPointDocument(String field, long value) {
+    Document doc = new Document();
+    doc.add(new SortedNumericDocValuesField(field, value));
+    doc.add(new LongPoint(field, value));
+    return doc;
+  }
+
  private Document createDocument(String field, long value) {
    Document doc = new Document();
    doc.add(new SortedNumericDocValuesField(field, value));