LUCENE-10458: BoundedDocSetIdIterator may supply an incorrect count in Weight#count(LeafReaderContext) when missingValue is enabled (#736)

This commit is contained in:
Lu Xugang 2022-03-23 22:54:52 +08:00 committed by Adrien Grand
parent 691760be93
commit 226880dd33
3 changed files with 158 additions and 37 deletions

View File

@ -11,6 +11,10 @@ API Changes
New Features
---------------------
* LUCENE-10385: Implement Weight#count on IndexSortSortedNumericDocValuesRangeQuery
to speed up computing the number of hits when possible. (Lu Xugang, Luca Cavanna, Adrien Grand)
* LUCENE-10422: Monitor Improvements: `Monitor` can use a custom `Directory`
implementation. `Monitor` can be created with a readonly `QueryIndex` in order to
have readonly `Monitor` instances. (Niko Usai)
@ -21,6 +25,7 @@ Improvements
Optimizations
---------------------
* LUCENE-10452: Hunspell: call checkCanceled less frequently to reduce the overhead (Peter Gromov)
* LUCENE-10451: Hunspell: don't perform potentially expensive spellchecking after timeout (Peter Gromov)
@ -150,9 +155,6 @@ New Features
based on TotalHitCountCollector that allows users to parallelize counting the
number of hits. (Luca Cavanna, Adrien Grand)
* LUCENE-10385: Implement Weight#count on IndexSortSortedNumericDocValuesRangeQuery
to speed up computing the number of hits when possible. (Luca Cavanna, Adrien Grand)
* LUCENE-10403: Add ArrayUtil#grow(T[]). (Greg Miller)
* LUCENE-10414: Add fn:fuzzyTerm interval function to flexible query parser (Dawid Weiss,

View File

@ -20,8 +20,10 @@ import java.io.IOException;
import java.util.Objects;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight;
@ -198,16 +200,18 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query {
@Override
public int count(LeafReaderContext context) throws IOException {
BoundedDocSetIdIterator disi = getDocIdSetIteratorOrNull(context);
if (disi != null) {
return disi.lastDoc - disi.firstDoc;
if (context.reader().hasDeletions() == false) {
BoundedDocIdSetIterator disi = getDocIdSetIteratorOrNull(context);
if (disi != null && disi.delegate == null) {
return disi.lastDoc - disi.firstDoc;
}
}
return fallbackWeight.count(context);
}
};
}
private BoundedDocSetIdIterator getDocIdSetIteratorOrNull(LeafReaderContext context)
private BoundedDocIdSetIterator getDocIdSetIteratorOrNull(LeafReaderContext context)
throws IOException {
SortedNumericDocValues sortedNumericValues =
DocValues.getSortedNumeric(context.reader(), field);
@ -237,7 +241,7 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query {
* {@link DocIdSetIterator} makes sure to wrap the original docvalues to skip over documents with
* no value.
*/
private BoundedDocSetIdIterator getDocIdSetIterator(
private BoundedDocIdSetIterator getDocIdSetIterator(
SortField sortField, LeafReaderContext context, DocIdSetIterator delegate)
throws IOException {
long lower = sortField.getReverse() ? upperValue : lowerValue;
@ -278,7 +282,19 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query {
}
int lastDocIdExclusive = high + 1;
return new BoundedDocSetIdIterator(firstDocIdInclusive, lastDocIdExclusive, delegate);
Object missingValue = sortField.getMissingValue();
BoundedDocIdSetIterator disi;
LeafReader reader = context.reader();
PointValues pointValues = reader.getPointValues(field);
final long missingLongValue = missingValue == null ? 0L : (long) missingValue;
// all documents have docValues or missing value falls outside the range
if ((pointValues != null && pointValues.getDocCount() == reader.maxDoc())
|| (missingLongValue < lowerValue || missingLongValue > upperValue)) {
disi = new BoundedDocIdSetIterator(firstDocIdInclusive, lastDocIdExclusive, null);
} else {
disi = new BoundedDocIdSetIterator(firstDocIdInclusive, lastDocIdExclusive, delegate);
}
return disi;
}
/** Compares the given document's value with a stored reference value. */
@ -306,14 +322,14 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query {
* A doc ID set iterator that wraps a delegate iterator and only returns doc IDs in the range
* [firstDocInclusive, lastDoc).
*/
private static class BoundedDocSetIdIterator extends DocIdSetIterator {
private static class BoundedDocIdSetIterator extends DocIdSetIterator {
private final int firstDoc;
private final int lastDoc;
private final DocIdSetIterator delegate;
private int docID = -1;
BoundedDocSetIdIterator(int firstDoc, int lastDoc, DocIdSetIterator delegate) {
BoundedDocIdSetIterator(int firstDoc, int lastDoc, DocIdSetIterator delegate) {
this.firstDoc = firstDoc;
this.lastDoc = lastDoc;
this.delegate = delegate;
@ -335,7 +351,12 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query {
target = firstDoc;
}
int result = delegate.advance(target);
int result;
if (delegate != null) {
result = delegate.advance(target);
} else {
result = target;
}
if (result < lastDoc) {
docID = result;
} else {

View File

@ -20,9 +20,11 @@ import static org.hamcrest.CoreMatchers.instanceOf;
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.SortedNumericDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
@ -59,7 +61,14 @@ public class TestIndexSortSortedNumericDocValuesRangeQuery extends LuceneTestCas
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
boolean reverse = random().nextBoolean();
SortField sortField = new SortedNumericSortField("dv", SortField.Type.LONG, reverse);
sortField.setMissingValue(random().nextLong());
boolean enableMissingValue = random().nextBoolean();
if (enableMissingValue) {
long missingValue =
random().nextBoolean()
? TestUtil.nextLong(random(), -100, 10000)
: (random().nextBoolean() ? Long.MIN_VALUE : Long.MAX_VALUE);
sortField.setMissingValue(missingValue);
}
iwc.setIndexSort(new Sort(sortField));
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
@ -459,30 +468,6 @@ public class TestIndexSortSortedNumericDocValuesRangeQuery extends LuceneTestCas
reader.close();
}
/**
 * Indexes a single document whose value (10) lies inside the queried range [1, 42] and checks
 * that {@code Weight#count} reports exactly one hit for every leaf.
 *
 * <p>The document carries only a doc-values field (no {@code LongPoint}), so presumably the
 * expected count has to come from the index-sort based path rather than the fallback query.
 */
public void testCount() throws IOException {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
  iwc.setIndexSort(new Sort(new SortedNumericSortField("field", SortField.Type.LONG)));
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);

  Document singleDoc = new Document();
  singleDoc.add(new SortedNumericDocValuesField("field", 10));
  writer.addDocument(singleDoc);

  IndexReader reader = writer.getReader();
  IndexSearcher searcher = newSearcher(reader);

  Query pointRange = LongPoint.newRangeQuery("field", 1, 42);
  Query sortedRange = new IndexSortSortedNumericDocValuesRangeQuery("field", 1, 42, pointRange);
  Weight weight = sortedRange.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
  for (LeafReaderContext leaf : searcher.getLeafContexts()) {
    assertEquals(1, weight.count(leaf));
  }

  // Release resources in the same order as before: writer, reader, directory.
  writer.close();
  reader.close();
  dir.close();
}
public void testFallbackCount() throws IOException {
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
@ -509,6 +494,119 @@ public class TestIndexSortSortedNumericDocValuesRangeQuery extends LuceneTestCas
dir.close();
}
/**
 * Randomized comparison of per-leaf {@code Weight#count} results between a plain {@code
 * LongPoint} range query and {@link IndexSortSortedNumericDocValuesRangeQuery} over the same
 * bounds.
 *
 * <p>Each iteration randomly decides whether the index sort has a missing value, leaves some
 * documents without the field, and randomly deletes the documents whose value is in [0, 10].
 */
public void testCompareCount() throws IOException {
  final int iters = atLeast(10);
  for (int iter = 0; iter < iters; ++iter) {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
    SortField sortField = new SortedNumericSortField("field", SortField.Type.LONG);

    // Randomly configure a missing value: either an ordinary value or one of the long extremes.
    if (random().nextBoolean()) {
      final long missingValue;
      if (random().nextBoolean()) {
        missingValue = TestUtil.nextLong(random(), -100, 10000);
      } else if (random().nextBoolean()) {
        missingValue = Long.MIN_VALUE;
      } else {
        missingValue = Long.MAX_VALUE;
      }
      sortField.setMissingValue(missingValue);
    }
    iwc.setIndexSort(new Sort(sortField));
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);

    final int numDocs = atLeast(100);
    for (int docIdx = 0; docIdx < numDocs; ++docIdx) {
      // Starts out empty; stays that way when zero values are drawn for this document.
      Document doc = new Document();
      int remaining = TestUtil.nextInt(random(), 0, 1);
      while (remaining-- > 0) {
        doc = createSNDVAndPointDocument("field", TestUtil.nextLong(random(), -100, 10000));
      }
      writer.addDocument(doc);
    }

    if (random().nextBoolean()) {
      writer.deleteDocuments(LongPoint.newRangeQuery("field", 0L, 10L));
    }

    final IndexReader reader = writer.getReader();
    final IndexSearcher searcher = newSearcher(reader);
    writer.close();

    for (int round = 0; round < 100; ++round) {
      // Each bound is either open-ended (long extreme) or a random value.
      final long min;
      if (random().nextBoolean()) {
        min = Long.MIN_VALUE;
      } else {
        min = TestUtil.nextLong(random(), -100, 10000);
      }
      final long max;
      if (random().nextBoolean()) {
        max = Long.MAX_VALUE;
      } else {
        max = TestUtil.nextLong(random(), -100, 10000);
      }

      final Query expectedQuery = LongPoint.newRangeQuery("field", min, max);
      final Query fallbackQuery = LongPoint.newRangeQuery("field", min, max);
      final Query sortedQuery =
          new IndexSortSortedNumericDocValuesRangeQuery("field", min, max, fallbackQuery);

      final Weight expectedWeight = expectedQuery.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
      final Weight sortedWeight = sortedQuery.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
      assertSameCount(expectedWeight, sortedWeight, searcher);
    }

    reader.close();
    dir.close();
  }
}
/**
 * Asserts that both weights report the same {@code count} for every leaf of the searcher.
 *
 * @param weight1 weight whose per-leaf count is used as the expected value
 * @param weight2 weight whose per-leaf count is checked against {@code weight1}
 * @param searcher supplies the leaf contexts to iterate over
 */
private void assertSameCount(Weight weight1, Weight weight2, IndexSearcher searcher)
    throws IOException {
  for (LeafReaderContext leaf : searcher.getLeafContexts()) {
    final int expected = weight1.count(leaf);
    final int actual = weight2.count(leaf);
    assertEquals(expected, actual);
  }
}
/**
 * Checks {@code Weight#count} when the sort's missing value sits exactly on one of the query
 * bounds: two documents carry an in-range value and a third has no value, and only the two
 * valued documents are expected to be counted.
 */
public void testCountBoundary() throws IOException {
  final long lowerValue = 1;
  final long upperValue = 100;

  Directory dir = newDirectory();
  IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
  SortField sortField = new SortedNumericSortField("field", SortField.Type.LONG);
  // The missing value coincides with either the lower or the upper bound of the range.
  final long boundaryMissingValue = random().nextBoolean() ? lowerValue : upperValue;
  sortField.setMissingValue(boundaryMissingValue);
  iwc.setIndexSort(new Sort(sortField));
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);

  // Two documents with a value inside [lowerValue, upperValue] ...
  for (int added = 0; added < 2; ++added) {
    final long inRange = TestUtil.nextLong(random(), lowerValue, upperValue);
    writer.addDocument(createSNDVAndPointDocument("field", inRange));
  }
  // ... and one document without any value for the field.
  writer.addDocument(createMissingValueDocument());

  IndexReader reader = writer.getReader();
  IndexSearcher searcher = newSearcher(reader);

  Query fallbackQuery = LongPoint.newRangeQuery("field", lowerValue, upperValue);
  Query sortedQuery =
      new IndexSortSortedNumericDocValuesRangeQuery(
          "field", lowerValue, upperValue, fallbackQuery);
  Weight weight = sortedQuery.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
  for (LeafReaderContext leaf : searcher.getLeafContexts()) {
    assertEquals(2, weight.count(leaf));
  }

  writer.close();
  reader.close();
  dir.close();
}
/**
 * Builds a document that contains only the stored string field {@code foo=fox}, i.e. no doc
 * values and no point for the numeric field under test.
 */
private Document createMissingValueDocument() {
  Document withoutValue = new Document();
  StringField marker = new StringField("foo", "fox", Field.Store.YES);
  withoutValue.add(marker);
  return withoutValue;
}
/**
 * Builds a document that indexes {@code value} under {@code field} both as sorted numeric doc
 * values and as a {@code LongPoint}.
 *
 * @param field name shared by the doc-values field and the point field
 * @param value the long value to index
 * @return a new document holding both fields
 */
private Document createSNDVAndPointDocument(String field, long value) {
  Document result = new Document();
  SortedNumericDocValuesField docValues = new SortedNumericDocValuesField(field, value);
  LongPoint point = new LongPoint(field, value);
  result.add(docValues);
  result.add(point);
  return result;
}
private Document createDocument(String field, long value) {
Document doc = new Document();
doc.add(new SortedNumericDocValuesField(field, value));