From 5450d722586b1922f98768b571d42482531d5ab8 Mon Sep 17 00:00:00 2001 From: Lu Xugang Date: Wed, 23 Mar 2022 22:54:52 +0800 Subject: [PATCH] LUCENE-10458: BoundedDocSetIdIterator may supply error count in Weight#count(LeafReaderContext) when missingValue enables (#736) --- lucene/CHANGES.txt | 8 +- ...xSortSortedNumericDocValuesRangeQuery.java | 39 +++-- ...xSortSortedNumericDocValuesRangeQuery.java | 148 +++++++++++++++--- 3 files changed, 158 insertions(+), 37 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index db1eba13849..435434aaf30 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -56,6 +56,10 @@ API Changes New Features --------------------- + +* LUCENE-10385: Implement Weight#count on IndexSortSortedNumericDocValuesRangeQuery + to speed up computing the number of hits when possible. (Lu Xugang, Luca Cavanna, Adrien Grand) + * LUCENE-10422: Monitor Improvements: `Monitor` can use a custom `Directory` implementation. `Monitor` can be created with a readonly `QueryIndex` in order to have readonly `Monitor` instances. (Niko Usai) @@ -66,6 +70,7 @@ Improvements Optimizations --------------------- + * LUCENE-10452: Hunspell: call checkCanceled less frequently to reduce the overhead (Peter Gromov) * LUCENE-10451: Hunspell: don't perform potentially expensive spellchecking after timeout (Peter Gromov) @@ -195,9 +200,6 @@ New Features based on TotalHitCountCollector that allows users to parallelize counting the number of hits. (Luca Cavanna, Adrien Grand) -* LUCENE-10385: Implement Weight#count on IndexSortSortedNumericDocValuesRangeQuery - to speed up computing the number of hits when possible. (Luca Cavanna, Adrien Grand) - * LUCENE-10403: Add ArrayUtil#grow(T[]). 
(Greg Miller) * LUCENE-10414: Add fn:fuzzyTerm interval function to flexible query parser (Dawid Weiss, diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/IndexSortSortedNumericDocValuesRangeQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/IndexSortSortedNumericDocValuesRangeQuery.java index 1dc80af399b..38d9314fefd 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/IndexSortSortedNumericDocValuesRangeQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/IndexSortSortedNumericDocValuesRangeQuery.java @@ -20,8 +20,10 @@ import java.io.IOException; import java.util.Objects; import org.apache.lucene.index.DocValues; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.PointValues; import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.search.ConstantScoreScorer; import org.apache.lucene.search.ConstantScoreWeight; @@ -198,16 +200,18 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query { @Override public int count(LeafReaderContext context) throws IOException { - BoundedDocSetIdIterator disi = getDocIdSetIteratorOrNull(context); - if (disi != null) { - return disi.lastDoc - disi.firstDoc; + if (context.reader().hasDeletions() == false) { + BoundedDocIdSetIterator disi = getDocIdSetIteratorOrNull(context); + if (disi != null && disi.delegate == null) { + return disi.lastDoc - disi.firstDoc; + } } return fallbackWeight.count(context); } }; } - private BoundedDocSetIdIterator getDocIdSetIteratorOrNull(LeafReaderContext context) + private BoundedDocIdSetIterator getDocIdSetIteratorOrNull(LeafReaderContext context) throws IOException { SortedNumericDocValues sortedNumericValues = DocValues.getSortedNumeric(context.reader(), field); @@ -237,7 +241,7 @@ public class 
IndexSortSortedNumericDocValuesRangeQuery extends Query { * {@link DocIdSetIterator} makes sure to wrap the original docvalues to skip over documents with * no value. */ - private BoundedDocSetIdIterator getDocIdSetIterator( + private BoundedDocIdSetIterator getDocIdSetIterator( SortField sortField, LeafReaderContext context, DocIdSetIterator delegate) throws IOException { long lower = sortField.getReverse() ? upperValue : lowerValue; @@ -278,7 +282,19 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query { } int lastDocIdExclusive = high + 1; - return new BoundedDocSetIdIterator(firstDocIdInclusive, lastDocIdExclusive, delegate); + Object missingValue = sortField.getMissingValue(); + BoundedDocIdSetIterator disi; + LeafReader reader = context.reader(); + PointValues pointValues = reader.getPointValues(field); + final long missingLongValue = missingValue == null ? 0L : (long) missingValue; + // all documents have docValues or missing value falls outside the range + if ((pointValues != null && pointValues.getDocCount() == reader.maxDoc()) + || (missingLongValue < lowerValue || missingLongValue > upperValue)) { + disi = new BoundedDocIdSetIterator(firstDocIdInclusive, lastDocIdExclusive, null); + } else { + disi = new BoundedDocIdSetIterator(firstDocIdInclusive, lastDocIdExclusive, delegate); + } + return disi; } /** Compares the given document's value with a stored reference value. */ @@ -306,14 +322,14 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query { * A doc ID set iterator that wraps a delegate iterator and only returns doc IDs in the range * [firstDocInclusive, lastDoc). 
*/ - private static class BoundedDocSetIdIterator extends DocIdSetIterator { + private static class BoundedDocIdSetIterator extends DocIdSetIterator { private final int firstDoc; private final int lastDoc; private final DocIdSetIterator delegate; private int docID = -1; - BoundedDocSetIdIterator(int firstDoc, int lastDoc, DocIdSetIterator delegate) { + BoundedDocIdSetIterator(int firstDoc, int lastDoc, DocIdSetIterator delegate) { this.firstDoc = firstDoc; this.lastDoc = lastDoc; this.delegate = delegate; @@ -335,7 +351,12 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query { target = firstDoc; } - int result = delegate.advance(target); + int result; + if (delegate != null) { + result = delegate.advance(target); + } else { + result = target; + } if (result < lastDoc) { docID = result; } else { diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestIndexSortSortedNumericDocValuesRangeQuery.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestIndexSortSortedNumericDocValuesRangeQuery.java index 22173748386..d52fadec5be 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestIndexSortSortedNumericDocValuesRangeQuery.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestIndexSortSortedNumericDocValuesRangeQuery.java @@ -20,9 +20,11 @@ import static org.hamcrest.CoreMatchers.instanceOf; import java.io.IOException; import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; import org.apache.lucene.document.LongPoint; import org.apache.lucene.document.SortedNumericDocValuesField; import org.apache.lucene.document.SortedSetDocValuesField; +import org.apache.lucene.document.StringField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriterConfig; @@ -59,7 +61,14 @@ public class TestIndexSortSortedNumericDocValuesRangeQuery extends LuceneTestCas IndexWriterConfig iwc = new 
IndexWriterConfig(new MockAnalyzer(random())); boolean reverse = random().nextBoolean(); SortField sortField = new SortedNumericSortField("dv", SortField.Type.LONG, reverse); - sortField.setMissingValue(random().nextLong()); + boolean enableMissingValue = random().nextBoolean(); + if (enableMissingValue) { + long missingValue = + random().nextBoolean() + ? TestUtil.nextLong(random(), -100, 10000) + : (random().nextBoolean() ? Long.MIN_VALUE : Long.MAX_VALUE); + sortField.setMissingValue(missingValue); + } iwc.setIndexSort(new Sort(sortField)); RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); @@ -459,30 +468,6 @@ public class TestIndexSortSortedNumericDocValuesRangeQuery extends LuceneTestCas reader.close(); } - public void testCount() throws IOException { - Directory dir = newDirectory(); - IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); - Sort indexSort = new Sort(new SortedNumericSortField("field", SortField.Type.LONG)); - iwc.setIndexSort(indexSort); - RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc); - Document doc = new Document(); - doc.add(new SortedNumericDocValuesField("field", 10)); - writer.addDocument(doc); - IndexReader reader = writer.getReader(); - IndexSearcher searcher = newSearcher(reader); - - Query fallbackQuery = LongPoint.newRangeQuery("field", 1, 42); - Query query = new IndexSortSortedNumericDocValuesRangeQuery("field", 1, 42, fallbackQuery); - Weight weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f); - for (LeafReaderContext context : searcher.getLeafContexts()) { - assertEquals(1, weight.count(context)); - } - - writer.close(); - reader.close(); - dir.close(); - } - public void testFallbackCount() throws IOException { Directory dir = newDirectory(); IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); @@ -509,6 +494,119 @@ public class TestIndexSortSortedNumericDocValuesRangeQuery extends LuceneTestCas dir.close(); } + public void 
testCompareCount() throws IOException { + final int iters = atLeast(10); + for (int iter = 0; iter < iters; ++iter) { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + SortField sortField = new SortedNumericSortField("field", SortField.Type.LONG); + boolean enableMissingValue = random().nextBoolean(); + if (enableMissingValue) { + long missingValue = + random().nextBoolean() + ? TestUtil.nextLong(random(), -100, 10000) + : (random().nextBoolean() ? Long.MIN_VALUE : Long.MAX_VALUE); + sortField.setMissingValue(missingValue); + } + iwc.setIndexSort(new Sort(sortField)); + + RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc); + + final int numDocs = atLeast(100); + for (int i = 0; i < numDocs; ++i) { + Document doc = new Document(); + final int numValues = TestUtil.nextInt(random(), 0, 1); + for (int j = 0; j < numValues; ++j) { + final long value = TestUtil.nextLong(random(), -100, 10000); + doc = createSNDVAndPointDocument("field", value); + } + writer.addDocument(doc); + } + + if (random().nextBoolean()) { + writer.deleteDocuments(LongPoint.newRangeQuery("field", 0L, 10L)); + } + + final IndexReader reader = writer.getReader(); + final IndexSearcher searcher = newSearcher(reader); + writer.close(); + + for (int i = 0; i < 100; ++i) { + final long min = + random().nextBoolean() ? Long.MIN_VALUE : TestUtil.nextLong(random(), -100, 10000); + final long max = + random().nextBoolean() ? 
Long.MAX_VALUE : TestUtil.nextLong(random(), -100, 10000); + final Query q1 = LongPoint.newRangeQuery("field", min, max); + + final Query fallbackQuery = LongPoint.newRangeQuery("field", min, max); + final Query q2 = + new IndexSortSortedNumericDocValuesRangeQuery("field", min, max, fallbackQuery); + final Weight weight1 = q1.createWeight(searcher, ScoreMode.COMPLETE, 1.0f); + final Weight weight2 = q2.createWeight(searcher, ScoreMode.COMPLETE, 1.0f); + assertSameCount(weight1, weight2, searcher); + } + + reader.close(); + dir.close(); + } + } + + private void assertSameCount(Weight weight1, Weight weight2, IndexSearcher searcher) + throws IOException { + for (LeafReaderContext context : searcher.getLeafContexts()) { + assertEquals(weight1.count(context), weight2.count(context)); + } + } + + public void testCountBoundary() throws IOException { + Directory dir = newDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); + SortField sortField = new SortedNumericSortField("field", SortField.Type.LONG); + boolean useLower = random().nextBoolean(); + long lowerValue = 1; + long upperValue = 100; + sortField.setMissingValue(useLower ? 
lowerValue : upperValue); + Sort indexSort = new Sort(sortField); + iwc.setIndexSort(indexSort); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc); + + writer.addDocument( + createSNDVAndPointDocument("field", random().nextLong(lowerValue, upperValue))); + writer.addDocument( + createSNDVAndPointDocument("field", random().nextLong(lowerValue, upperValue))); + // missingValue + writer.addDocument(createMissingValueDocument()); + + IndexReader reader = writer.getReader(); + IndexSearcher searcher = newSearcher(reader); + + Query fallbackQuery = LongPoint.newRangeQuery("field", lowerValue, upperValue); + Query query = + new IndexSortSortedNumericDocValuesRangeQuery( + "field", lowerValue, upperValue, fallbackQuery); + Weight weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f); + for (LeafReaderContext context : searcher.getLeafContexts()) { + assertEquals(2, weight.count(context)); + } + + writer.close(); + reader.close(); + dir.close(); + } + + private Document createMissingValueDocument() { + Document doc = new Document(); + doc.add(new StringField("foo", "fox", Field.Store.YES)); + return doc; + } + + private Document createSNDVAndPointDocument(String field, long value) { + Document doc = new Document(); + doc.add(new SortedNumericDocValuesField(field, value)); + doc.add(new LongPoint(field, value)); + return doc; + } + private Document createDocument(String field, long value) { Document doc = new Document(); doc.add(new SortedNumericDocValuesField(field, value));