LUCENE-10458: BoundedDocSetIdIterator may supply an incorrect count in Weight#count(LeafReaderContext) when missingValue is enabled (#736)

This commit is contained in:
Lu Xugang 2022-03-23 22:54:52 +08:00 committed by GitHub
parent 1c6f631678
commit 5450d72258
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 158 additions and 37 deletions

View File

@ -56,6 +56,10 @@ API Changes
New Features New Features
--------------------- ---------------------
* LUCENE-10385: Implement Weight#count on IndexSortSortedNumericDocValuesRangeQuery
to speed up computing the number of hits when possible. (Lu Xugang, Luca Cavanna, Adrien Grand)
* LUCENE-10422: Monitor Improvements: `Monitor` can use a custom `Directory` * LUCENE-10422: Monitor Improvements: `Monitor` can use a custom `Directory`
implementation. `Monitor` can be created with a readonly `QueryIndex` in order to implementation. `Monitor` can be created with a readonly `QueryIndex` in order to
have readonly `Monitor` instances. (Niko Usai) have readonly `Monitor` instances. (Niko Usai)
@ -66,6 +70,7 @@ Improvements
Optimizations Optimizations
--------------------- ---------------------
* LUCENE-10452: Hunspell: call checkCanceled less frequently to reduce the overhead (Peter Gromov) * LUCENE-10452: Hunspell: call checkCanceled less frequently to reduce the overhead (Peter Gromov)
* LUCENE-10451: Hunspell: don't perform potentially expensive spellchecking after timeout (Peter Gromov) * LUCENE-10451: Hunspell: don't perform potentially expensive spellchecking after timeout (Peter Gromov)
@ -195,9 +200,6 @@ New Features
based on TotalHitCountCollector that allows users to parallelize counting the based on TotalHitCountCollector that allows users to parallelize counting the
number of hits. (Luca Cavanna, Adrien Grand) number of hits. (Luca Cavanna, Adrien Grand)
* LUCENE-10385: Implement Weight#count on IndexSortSortedNumericDocValuesRangeQuery
to speed up computing the number of hits when possible. (Luca Cavanna, Adrien Grand)
* LUCENE-10403: Add ArrayUtil#grow(T[]). (Greg Miller) * LUCENE-10403: Add ArrayUtil#grow(T[]). (Greg Miller)
* LUCENE-10414: Add fn:fuzzyTerm interval function to flexible query parser (Dawid Weiss, * LUCENE-10414: Add fn:fuzzyTerm interval function to flexible query parser (Dawid Weiss,

View File

@ -20,8 +20,10 @@ import java.io.IOException;
import java.util.Objects; import java.util.Objects;
import org.apache.lucene.index.DocValues; import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.search.ConstantScoreScorer; import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight; import org.apache.lucene.search.ConstantScoreWeight;
@ -198,16 +200,18 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query {
@Override @Override
public int count(LeafReaderContext context) throws IOException { public int count(LeafReaderContext context) throws IOException {
BoundedDocSetIdIterator disi = getDocIdSetIteratorOrNull(context); if (context.reader().hasDeletions() == false) {
if (disi != null) { BoundedDocIdSetIterator disi = getDocIdSetIteratorOrNull(context);
return disi.lastDoc - disi.firstDoc; if (disi != null && disi.delegate == null) {
return disi.lastDoc - disi.firstDoc;
}
} }
return fallbackWeight.count(context); return fallbackWeight.count(context);
} }
}; };
} }
private BoundedDocSetIdIterator getDocIdSetIteratorOrNull(LeafReaderContext context) private BoundedDocIdSetIterator getDocIdSetIteratorOrNull(LeafReaderContext context)
throws IOException { throws IOException {
SortedNumericDocValues sortedNumericValues = SortedNumericDocValues sortedNumericValues =
DocValues.getSortedNumeric(context.reader(), field); DocValues.getSortedNumeric(context.reader(), field);
@ -237,7 +241,7 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query {
* {@link DocIdSetIterator} makes sure to wrap the original docvalues to skip over documents with * {@link DocIdSetIterator} makes sure to wrap the original docvalues to skip over documents with
* no value. * no value.
*/ */
private BoundedDocSetIdIterator getDocIdSetIterator( private BoundedDocIdSetIterator getDocIdSetIterator(
SortField sortField, LeafReaderContext context, DocIdSetIterator delegate) SortField sortField, LeafReaderContext context, DocIdSetIterator delegate)
throws IOException { throws IOException {
long lower = sortField.getReverse() ? upperValue : lowerValue; long lower = sortField.getReverse() ? upperValue : lowerValue;
@ -278,7 +282,19 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query {
} }
int lastDocIdExclusive = high + 1; int lastDocIdExclusive = high + 1;
return new BoundedDocSetIdIterator(firstDocIdInclusive, lastDocIdExclusive, delegate); Object missingValue = sortField.getMissingValue();
BoundedDocIdSetIterator disi;
LeafReader reader = context.reader();
PointValues pointValues = reader.getPointValues(field);
final long missingLongValue = missingValue == null ? 0L : (long) missingValue;
// all documents have docValues or missing value falls outside the range
if ((pointValues != null && pointValues.getDocCount() == reader.maxDoc())
|| (missingLongValue < lowerValue || missingLongValue > upperValue)) {
disi = new BoundedDocIdSetIterator(firstDocIdInclusive, lastDocIdExclusive, null);
} else {
disi = new BoundedDocIdSetIterator(firstDocIdInclusive, lastDocIdExclusive, delegate);
}
return disi;
} }
/** Compares the given document's value with a stored reference value. */ /** Compares the given document's value with a stored reference value. */
@ -306,14 +322,14 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query {
* A doc ID set iterator that wraps a delegate iterator and only returns doc IDs in the range * A doc ID set iterator that wraps a delegate iterator and only returns doc IDs in the range
* [firstDocInclusive, lastDoc). * [firstDocInclusive, lastDoc).
*/ */
private static class BoundedDocSetIdIterator extends DocIdSetIterator { private static class BoundedDocIdSetIterator extends DocIdSetIterator {
private final int firstDoc; private final int firstDoc;
private final int lastDoc; private final int lastDoc;
private final DocIdSetIterator delegate; private final DocIdSetIterator delegate;
private int docID = -1; private int docID = -1;
BoundedDocSetIdIterator(int firstDoc, int lastDoc, DocIdSetIterator delegate) { BoundedDocIdSetIterator(int firstDoc, int lastDoc, DocIdSetIterator delegate) {
this.firstDoc = firstDoc; this.firstDoc = firstDoc;
this.lastDoc = lastDoc; this.lastDoc = lastDoc;
this.delegate = delegate; this.delegate = delegate;
@ -335,7 +351,12 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query {
target = firstDoc; target = firstDoc;
} }
int result = delegate.advance(target); int result;
if (delegate != null) {
result = delegate.advance(target);
} else {
result = target;
}
if (result < lastDoc) { if (result < lastDoc) {
docID = result; docID = result;
} else { } else {

View File

@ -20,9 +20,11 @@ import static org.hamcrest.CoreMatchers.instanceOf;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.LongPoint; import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.SortedNumericDocValuesField; import org.apache.lucene.document.SortedNumericDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig;
@ -59,7 +61,14 @@ public class TestIndexSortSortedNumericDocValuesRangeQuery extends LuceneTestCas
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
boolean reverse = random().nextBoolean(); boolean reverse = random().nextBoolean();
SortField sortField = new SortedNumericSortField("dv", SortField.Type.LONG, reverse); SortField sortField = new SortedNumericSortField("dv", SortField.Type.LONG, reverse);
sortField.setMissingValue(random().nextLong()); boolean enableMissingValue = random().nextBoolean();
if (enableMissingValue) {
long missingValue =
random().nextBoolean()
? TestUtil.nextLong(random(), -100, 10000)
: (random().nextBoolean() ? Long.MIN_VALUE : Long.MAX_VALUE);
sortField.setMissingValue(missingValue);
}
iwc.setIndexSort(new Sort(sortField)); iwc.setIndexSort(new Sort(sortField));
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
@ -459,30 +468,6 @@ public class TestIndexSortSortedNumericDocValuesRangeQuery extends LuceneTestCas
reader.close(); reader.close();
} }
public void testCount() throws IOException {
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
Sort indexSort = new Sort(new SortedNumericSortField("field", SortField.Type.LONG));
iwc.setIndexSort(indexSort);
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
Document doc = new Document();
doc.add(new SortedNumericDocValuesField("field", 10));
writer.addDocument(doc);
IndexReader reader = writer.getReader();
IndexSearcher searcher = newSearcher(reader);
Query fallbackQuery = LongPoint.newRangeQuery("field", 1, 42);
Query query = new IndexSortSortedNumericDocValuesRangeQuery("field", 1, 42, fallbackQuery);
Weight weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
for (LeafReaderContext context : searcher.getLeafContexts()) {
assertEquals(1, weight.count(context));
}
writer.close();
reader.close();
dir.close();
}
public void testFallbackCount() throws IOException { public void testFallbackCount() throws IOException {
Directory dir = newDirectory(); Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random())); IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
@ -509,6 +494,119 @@ public class TestIndexSortSortedNumericDocValuesRangeQuery extends LuceneTestCas
dir.close(); dir.close();
} }
/**
 * Randomized check that {@code Weight#count} of an {@code
 * IndexSortSortedNumericDocValuesRangeQuery} agrees, leaf by leaf, with the count reported by a
 * plain {@code LongPoint} range query over the same bounds.
 *
 * <p>Each iteration builds a fresh index sorted on "field", optionally configures a missing
 * value on the sort field, optionally deletes some documents, and then compares counts for 100
 * random ranges.
 */
public void testCompareCount() throws IOException {
final int iters = atLeast(10);
for (int iter = 0; iter < iters; ++iter) {
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
SortField sortField = new SortedNumericSortField("field", SortField.Type.LONG);
// Randomly leave the missing value unset, or set it either to a value drawn from the same
// bounds used for document values below, or to one of the Long extremes.
boolean enableMissingValue = random().nextBoolean();
if (enableMissingValue) {
long missingValue =
random().nextBoolean()
? TestUtil.nextLong(random(), -100, 10000)
: (random().nextBoolean() ? Long.MIN_VALUE : Long.MAX_VALUE);
sortField.setMissingValue(missingValue);
}
iwc.setIndexSort(new Sort(sortField));
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
final int numDocs = atLeast(100);
for (int i = 0; i < numDocs; ++i) {
// Starts as an empty document; it stays empty (no value for "field") when numValues is 0.
Document doc = new Document();
final int numValues = TestUtil.nextInt(random(), 0, 1);
for (int j = 0; j < numValues; ++j) {
final long value = TestUtil.nextLong(random(), -100, 10000);
// Replaces the empty document with one carrying both doc values and a point for "field".
doc = createSNDVAndPointDocument("field", value);
}
writer.addDocument(doc);
}
// Randomly delete the documents whose point value matches [0, 10] so that some runs
// exercise the count path on an index with deleted documents.
if (random().nextBoolean()) {
writer.deleteDocuments(LongPoint.newRangeQuery("field", 0L, 10L));
}
final IndexReader reader = writer.getReader();
final IndexSearcher searcher = newSearcher(reader);
writer.close();
for (int i = 0; i < 100; ++i) {
// min and max are drawn independently, so min may end up greater than max.
final long min =
random().nextBoolean() ? Long.MIN_VALUE : TestUtil.nextLong(random(), -100, 10000);
final long max =
random().nextBoolean() ? Long.MAX_VALUE : TestUtil.nextLong(random(), -100, 10000);
// q1 is the reference; q2 is the query under test, given an equivalent point query as its
// fallback.
final Query q1 = LongPoint.newRangeQuery("field", min, max);
final Query fallbackQuery = LongPoint.newRangeQuery("field", min, max);
final Query q2 =
new IndexSortSortedNumericDocValuesRangeQuery("field", min, max, fallbackQuery);
final Weight weight1 = q1.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
final Weight weight2 = q2.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
assertSameCount(weight1, weight2, searcher);
}
reader.close();
dir.close();
}
}
/**
 * Asserts that both weights report the same {@code count} for every leaf of the searcher.
 *
 * @param weight1 weight whose per-leaf count is used as the expected value
 * @param weight2 weight whose per-leaf count is compared against {@code weight1}
 * @param searcher searcher supplying the leaf contexts to iterate over
 * @throws IOException if computing a count fails
 */
private void assertSameCount(Weight weight1, Weight weight2, IndexSearcher searcher)
    throws IOException {
  for (LeafReaderContext leaf : searcher.getLeafContexts()) {
    // Evaluate the expected side first, matching the argument order of assertEquals.
    final int expected = weight1.count(leaf);
    final int actual = weight2.count(leaf);
    assertEquals(expected, actual);
  }
}
/**
 * Checks {@code Weight#count} when the sort field's missing value sits exactly on one of the
 * query bounds.
 *
 * <p>The index holds two documents with a value for "field" plus one document without the
 * field; the expected count per leaf is 2, i.e. the document without a value must not be
 * counted.
 */
public void testCountBoundary() throws IOException {
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
SortField sortField = new SortedNumericSortField("field", SortField.Type.LONG);
boolean useLower = random().nextBoolean();
long lowerValue = 1;
long upperValue = 100;
// The missing value is set to one of the two query bounds, chosen at random.
sortField.setMissingValue(useLower ? lowerValue : upperValue);
Sort indexSort = new Sort(sortField);
iwc.setIndexSort(indexSort);
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
// Two documents that carry both doc values and a point for "field".
writer.addDocument(
createSNDVAndPointDocument("field", random().nextLong(lowerValue, upperValue)));
writer.addDocument(
createSNDVAndPointDocument("field", random().nextLong(lowerValue, upperValue)));
// missingValue
writer.addDocument(createMissingValueDocument());
IndexReader reader = writer.getReader();
IndexSearcher searcher = newSearcher(reader);
Query fallbackQuery = LongPoint.newRangeQuery("field", lowerValue, upperValue);
Query query =
new IndexSortSortedNumericDocValuesRangeQuery(
"field", lowerValue, upperValue, fallbackQuery);
Weight weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
// NOTE(review): 2 is asserted for every leaf, so this presumably relies on all three
// documents ending up in a single segment — confirm.
for (LeafReaderContext context : searcher.getLeafContexts()) {
assertEquals(2, weight.count(context));
}
writer.close();
reader.close();
dir.close();
}
/**
 * Builds a document that contains only a stored string field named "foo", and therefore has no
 * value for the numeric field used by the count tests.
 *
 * @return the newly created document
 */
private Document createMissingValueDocument() {
  final Document withoutValue = new Document();
  final StringField unrelated = new StringField("foo", "fox", Field.Store.YES);
  withoutValue.add(unrelated);
  return withoutValue;
}
/**
 * Builds a document that indexes {@code value} under {@code field} twice: once as sorted numeric
 * doc values and once as a long point.
 *
 * @param field name shared by the doc-values field and the point field
 * @param value the value to index
 * @return the newly created document
 */
private Document createSNDVAndPointDocument(String field, long value) {
  final Document indexed = new Document();
  final SortedNumericDocValuesField docValuesField = new SortedNumericDocValuesField(field, value);
  indexed.add(docValuesField);
  final LongPoint pointField = new LongPoint(field, value);
  indexed.add(pointField);
  return indexed;
}
private Document createDocument(String field, long value) { private Document createDocument(String field, long value) {
Document doc = new Document(); Document doc = new Document();
doc.add(new SortedNumericDocValuesField(field, value)); doc.add(new SortedNumericDocValuesField(field, value));