LUCENE-10458: BoundedDocSetIdIterator may return a wrong count in Weight#count(LeafReaderContext) when missingValue is enabled (#736)

2022-03-23 22:54:52 +08:00 · 2022-03-23 22:54:52 +08:00 · 5450d72258
parent 1c6f631678
commit 5450d72258
3 changed files with 158 additions and 37 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -56,6 +56,10 @@ API Changes
 New Features
 ---------------------
 * LUCENE-10385: Implement Weight#count on IndexSortSortedNumericDocValuesRangeQuery
  to speed up computing the number of hits when possible. (Lu Xugang, Luca Cavanna, Adrien Grand)
 * LUCENE-10422: Monitor Improvements: `Monitor` can use a custom `Directory` 
 implementation. `Monitor` can be created with a readonly `QueryIndex` in order to 
 have readonly `Monitor` instances. (Niko Usai)
@ -66,6 +70,7 @@ Improvements
 Optimizations
 ---------------------
 * LUCENE-10452: Hunspell: call checkCanceled less frequently to reduce the overhead (Peter Gromov)
 * LUCENE-10451: Hunspell: don't perform potentially expensive spellchecking after timeout (Peter Gromov)
@ -195,9 +200,6 @@ New Features
  based on TotalHitCountCollector that allows users to parallelize counting the
  number of hits. (Luca Cavanna, Adrien Grand)
 * LUCENE-10385: Implement Weight#count on IndexSortSortedNumericDocValuesRangeQuery
  to speed up computing the number of hits when possible. (Luca Cavanna, Adrien Grand)
 * LUCENE-10403: Add ArrayUtil#grow(T[]). (Greg Miller)
 * LUCENE-10414: Add fn:fuzzyTerm interval function to flexible query parser (Dawid Weiss, 
--- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/IndexSortSortedNumericDocValuesRangeQuery.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/IndexSortSortedNumericDocValuesRangeQuery.java
@ -20,8 +20,10 @@ import java.io.IOException;
 import java.util.Objects;
 import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.index.PointValues;
 import org.apache.lucene.index.SortedNumericDocValues;
 import org.apache.lucene.search.ConstantScoreScorer;
 import org.apache.lucene.search.ConstantScoreWeight;
@ -198,16 +200,18 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query {
      @Override
      public int count(LeafReaderContext context) throws IOException {
-        BoundedDocSetIdIterator disi = getDocIdSetIteratorOrNull(context);
+        if (context.reader().hasDeletions() == false) {
-        if (disi != null) {
+          BoundedDocIdSetIterator disi = getDocIdSetIteratorOrNull(context);
-          return disi.lastDoc - disi.firstDoc;
+          if (disi != null && disi.delegate == null) {
            return disi.lastDoc - disi.firstDoc;
          }
        }
        return fallbackWeight.count(context);
      }
    };
  }
-  private BoundedDocSetIdIterator getDocIdSetIteratorOrNull(LeafReaderContext context)
+  private BoundedDocIdSetIterator getDocIdSetIteratorOrNull(LeafReaderContext context)
      throws IOException {
    SortedNumericDocValues sortedNumericValues =
        DocValues.getSortedNumeric(context.reader(), field);
@ -237,7 +241,7 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query {
   * {@link DocIdSetIterator} makes sure to wrap the original docvalues to skip over documents with
   * no value.
   */
-  private BoundedDocSetIdIterator getDocIdSetIterator(
+  private BoundedDocIdSetIterator getDocIdSetIterator(
      SortField sortField, LeafReaderContext context, DocIdSetIterator delegate)
      throws IOException {
    long lower = sortField.getReverse() ? upperValue : lowerValue;
@ -278,7 +282,19 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query {
    }
    int lastDocIdExclusive = high + 1;
-    return new BoundedDocSetIdIterator(firstDocIdInclusive, lastDocIdExclusive, delegate);
+    Object missingValue = sortField.getMissingValue();
    BoundedDocIdSetIterator disi;
    LeafReader reader = context.reader();
    PointValues pointValues = reader.getPointValues(field);
    final long missingLongValue = missingValue == null ? 0L : (long) missingValue;
    // all documents have docValues or missing value falls outside the range
    if ((pointValues != null && pointValues.getDocCount() == reader.maxDoc())
        || (missingLongValue < lowerValue || missingLongValue > upperValue)) {
      disi = new BoundedDocIdSetIterator(firstDocIdInclusive, lastDocIdExclusive, null);
    } else {
      disi = new BoundedDocIdSetIterator(firstDocIdInclusive, lastDocIdExclusive, delegate);
    }
    return disi;
  }
  /** Compares the given document's value with a stored reference value. */
@ -306,14 +322,14 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query {
   * A doc ID set iterator that wraps a delegate iterator and only returns doc IDs in the range
   * [firstDocInclusive, lastDoc).
   */
-  private static class BoundedDocSetIdIterator extends DocIdSetIterator {
+  private static class BoundedDocIdSetIterator extends DocIdSetIterator {
    private final int firstDoc;
    private final int lastDoc;
    private final DocIdSetIterator delegate;
    private int docID = -1;
-    BoundedDocSetIdIterator(int firstDoc, int lastDoc, DocIdSetIterator delegate) {
+    BoundedDocIdSetIterator(int firstDoc, int lastDoc, DocIdSetIterator delegate) {
      this.firstDoc = firstDoc;
      this.lastDoc = lastDoc;
      this.delegate = delegate;
@ -335,7 +351,12 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query {
        target = firstDoc;
      }
-      int result = delegate.advance(target);
+      int result;
      if (delegate != null) {
        result = delegate.advance(target);
      } else {
        result = target;
      }
      if (result < lastDoc) {
        docID = result;
      } else {
--- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestIndexSortSortedNumericDocValuesRangeQuery.java
+++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestIndexSortSortedNumericDocValuesRangeQuery.java
@ -20,9 +20,11 @@ import static org.hamcrest.CoreMatchers.instanceOf;
 import java.io.IOException;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.LongPoint;
 import org.apache.lucene.document.SortedNumericDocValuesField;
 import org.apache.lucene.document.SortedSetDocValuesField;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriterConfig;
@ -59,7 +61,14 @@ public class TestIndexSortSortedNumericDocValuesRangeQuery extends LuceneTestCas
      IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
      boolean reverse = random().nextBoolean();
      SortField sortField = new SortedNumericSortField("dv", SortField.Type.LONG, reverse);
-      sortField.setMissingValue(random().nextLong());
+      boolean enableMissingValue = random().nextBoolean();
      if (enableMissingValue) {
        long missingValue =
            random().nextBoolean()
                ? TestUtil.nextLong(random(), -100, 10000)
                : (random().nextBoolean() ? Long.MIN_VALUE : Long.MAX_VALUE);
        sortField.setMissingValue(missingValue);
      }
      iwc.setIndexSort(new Sort(sortField));
      RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
@ -459,30 +468,6 @@ public class TestIndexSortSortedNumericDocValuesRangeQuery extends LuceneTestCas
    reader.close();
  }
  /**
   * Indexes one document whose doc value (10) lies inside the queried range [1, 42] and checks
   * that {@link Weight#count} reports exactly one hit on every leaf.
   */
  public void testCount() throws IOException {
    Directory directory = newDirectory();
    IndexWriterConfig config = new IndexWriterConfig(new MockAnalyzer(random()));
    // The index is sorted on the very field the query targets.
    config.setIndexSort(new Sort(new SortedNumericSortField("field", SortField.Type.LONG)));
    RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory, config);

    // The only document: a doc value without an accompanying point.
    Document onlyDoc = new Document();
    onlyDoc.add(new SortedNumericDocValuesField("field", 10));
    indexWriter.addDocument(onlyDoc);

    IndexReader indexReader = indexWriter.getReader();
    IndexSearcher indexSearcher = newSearcher(indexReader);

    Query pointFallback = LongPoint.newRangeQuery("field", 1, 42);
    Query sortedRangeQuery =
        new IndexSortSortedNumericDocValuesRangeQuery("field", 1, 42, pointFallback);
    Weight countingWeight = sortedRangeQuery.createWeight(indexSearcher, ScoreMode.COMPLETE, 1.0f);
    for (LeafReaderContext leaf : indexSearcher.getLeafContexts()) {
      assertEquals(1, countingWeight.count(leaf));
    }

    indexWriter.close();
    indexReader.close();
    directory.close();
  }
  public void testFallbackCount() throws IOException {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
@ -509,6 +494,119 @@ public class TestIndexSortSortedNumericDocValuesRangeQuery extends LuceneTestCas
    dir.close();
  }
  /**
   * Randomized comparison of per-leaf counts: for many random ranges, the count reported by {@link
   * IndexSortSortedNumericDocValuesRangeQuery} must equal the count reported by a plain {@link
   * LongPoint} range query over the same bounds. The index sort may or may not define a missing
   * value, documents may or may not carry a value, and a range of documents may be deleted.
   */
  public void testCompareCount() throws IOException {
    final int iterations = atLeast(10);
    for (int round = 0; round < iterations; ++round) {
      Directory directory = newDirectory();
      IndexWriterConfig config = new IndexWriterConfig(new MockAnalyzer(random()));
      SortField indexSortField = new SortedNumericSortField("field", SortField.Type.LONG);
      if (random().nextBoolean()) {
        // Either a value from the same interval the documents draw from, or a long extreme.
        final long missing;
        if (random().nextBoolean()) {
          missing = TestUtil.nextLong(random(), -100, 10000);
        } else if (random().nextBoolean()) {
          missing = Long.MIN_VALUE;
        } else {
          missing = Long.MAX_VALUE;
        }
        indexSortField.setMissingValue(missing);
      }
      config.setIndexSort(new Sort(indexSortField));
      RandomIndexWriter indexWriter = new RandomIndexWriter(random(), directory, config);

      final int docCount = atLeast(100);
      for (int d = 0; d < docCount; ++d) {
        // Each document holds either no value at all or exactly one doc value plus point.
        final boolean hasValue = TestUtil.nextInt(random(), 0, 1) > 0;
        final Document doc;
        if (hasValue) {
          doc = createSNDVAndPointDocument("field", TestUtil.nextLong(random(), -100, 10000));
        } else {
          doc = new Document();
        }
        indexWriter.addDocument(doc);
      }

      // Sometimes introduce deletions, matched through the point field.
      if (random().nextBoolean()) {
        indexWriter.deleteDocuments(LongPoint.newRangeQuery("field", 0L, 10L));
      }

      final IndexReader indexReader = indexWriter.getReader();
      final IndexSearcher indexSearcher = newSearcher(indexReader);
      indexWriter.close();

      for (int attempt = 0; attempt < 100; ++attempt) {
        // Each bound is either open-ended (a long extreme) or drawn from the value interval.
        final long lower;
        if (random().nextBoolean()) {
          lower = Long.MIN_VALUE;
        } else {
          lower = TestUtil.nextLong(random(), -100, 10000);
        }
        final long upper;
        if (random().nextBoolean()) {
          upper = Long.MAX_VALUE;
        } else {
          upper = TestUtil.nextLong(random(), -100, 10000);
        }

        final Query expectedQuery = LongPoint.newRangeQuery("field", lower, upper);
        final Query pointFallback = LongPoint.newRangeQuery("field", lower, upper);
        final Query sortedRangeQuery =
            new IndexSortSortedNumericDocValuesRangeQuery("field", lower, upper, pointFallback);
        final Weight expectedWeight =
            expectedQuery.createWeight(indexSearcher, ScoreMode.COMPLETE, 1.0f);
        final Weight sortedRangeWeight =
            sortedRangeQuery.createWeight(indexSearcher, ScoreMode.COMPLETE, 1.0f);
        assertSameCount(expectedWeight, sortedRangeWeight, indexSearcher);
      }

      indexReader.close();
      directory.close();
    }
  }
  /**
   * Asserts that both weights report the same count on every leaf of the given searcher; the
   * count of {@code weight1} is used as the expected value.
   */
  private void assertSameCount(Weight weight1, Weight weight2, IndexSearcher searcher)
      throws IOException {
    for (LeafReaderContext leaf : searcher.getLeafContexts()) {
      final int expected = weight1.count(leaf);
      final int actual = weight2.count(leaf);
      assertEquals(expected, actual);
    }
  }
  /**
   * Checks {@link Weight#count} when the index sort's missing value coincides with one of the
   * query bounds. Two documents carry a value inside the queried range and a third document has
   * no value for the field; the value-less document must not be counted, so every leaf is
   * expected to report exactly 2.
   */
  public void testCountBoundary() throws IOException {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
    SortField sortField = new SortedNumericSortField("field", SortField.Type.LONG);
    boolean useLower = random().nextBoolean();
    long lowerValue = 1;
    long upperValue = 100;
    // Put the missing value exactly on a range boundary (either end, chosen at random).
    sortField.setMissingValue(useLower ? lowerValue : upperValue);
    Sort indexSort = new Sort(sortField);
    iwc.setIndexSort(indexSort);
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
    // Use TestUtil.nextLong like the rest of this file rather than Random#nextLong(long, long),
    // which only exists on newer JDKs. Any value in [lowerValue, upperValue] matches the query.
    writer.addDocument(
        createSNDVAndPointDocument("field", TestUtil.nextLong(random(), lowerValue, upperValue)));
    writer.addDocument(
        createSNDVAndPointDocument("field", TestUtil.nextLong(random(), lowerValue, upperValue)));
    // A document without any value for "field".
    writer.addDocument(createMissingValueDocument());

    IndexReader reader = writer.getReader();
    IndexSearcher searcher = newSearcher(reader);

    Query fallbackQuery = LongPoint.newRangeQuery("field", lowerValue, upperValue);
    Query query =
        new IndexSortSortedNumericDocValuesRangeQuery(
            "field", lowerValue, upperValue, fallbackQuery);
    Weight weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
    for (LeafReaderContext context : searcher.getLeafContexts()) {
      assertEquals(2, weight.count(context));
    }

    writer.close();
    reader.close();
    dir.close();
  }
  /**
   * Builds a document that holds only a stored string field named "foo", with no numeric doc
   * value and no point.
   */
  private Document createMissingValueDocument() {
    Document valueless = new Document();
    StringField unrelatedField = new StringField("foo", "fox", Field.Store.YES);
    valueless.add(unrelatedField);
    return valueless;
  }
  /**
   * Builds a document that indexes {@code value} under {@code field} twice: once as a sorted
   * numeric doc value and once as a long point.
   */
  private Document createSNDVAndPointDocument(String field, long value) {
    Document result = new Document();
    SortedNumericDocValuesField docValue = new SortedNumericDocValuesField(field, value);
    result.add(docValue);
    LongPoint point = new LongPoint(field, value);
    result.add(point);
    return result;
  }
  private Document createDocument(String field, long value) {
    Document doc = new Document();
    doc.add(new SortedNumericDocValuesField(field, value));