Take advantage of the doc value skipper when it is primary sort (#13592)

Take advantage of the doc values skipper when the queried field is the primary index sort in
 SortedNumericDocValuesRangeQuery and SortedSetDocValuesRangeQuery.
This commit is contained in:
Ignacio Vera 2024-08-22 12:28:25 +02:00 committed by GitHub
parent b4a06770c9
commit 579427228d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 356 additions and 17 deletions

View File

@ -141,6 +141,9 @@ New Features
* GITHUB#13604: Add Kmeans clustering on vectors (Mayya Sharipova, Jim Ferenczi, Tom Veasey)
* GITHUB#13592: Take advantage of the doc value skipper when it is primary sort in SortedNumericDocValuesRangeQuery
and SortedSetDocValuesRangeQuery. (Ignacio Vera)
Improvements
---------------------

View File

@ -18,13 +18,16 @@ package org.apache.lucene.document;
import java.io.IOException;
import java.util.Objects;
import java.util.function.LongPredicate;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.DocValuesSkipper;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldExistsQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchNoDocsQuery;
@ -32,6 +35,7 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.ScorerSupplier;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.Weight;
@ -116,12 +120,28 @@ final class SortedNumericDocValuesRangeQuery extends Query {
if (skipper.minValue() > upperValue || skipper.maxValue() < lowerValue) {
return null;
}
if (skipper.docCount() == context.reader().maxDoc()
&& skipper.minValue() >= lowerValue
&& skipper.maxValue() <= upperValue) {
final var scorer =
new ConstantScoreScorer(
score(), scoreMode, DocIdSetIterator.all(skipper.docCount()));
return new DefaultScorerSupplier(scorer);
}
}
SortedNumericDocValues values = DocValues.getSortedNumeric(context.reader(), field);
final NumericDocValues singleton = DocValues.unwrapSingleton(values);
TwoPhaseIterator iterator;
if (singleton != null) {
if (skipper != null) {
final DocIdSetIterator psIterator =
getDocIdSetIteratorOrNullForPrimarySort(context.reader(), singleton, skipper);
if (psIterator != null) {
return new DefaultScorerSupplier(
new ConstantScoreScorer(score(), scoreMode, psIterator));
}
}
iterator =
new TwoPhaseIterator(singleton) {
@Override
@ -166,4 +186,65 @@ final class SortedNumericDocValuesRangeQuery extends Query {
}
};
}
/**
 * Tries to express the matches of this range query as a single contiguous run of doc IDs. This
 * only applies when the skipper reports a value for every document of the segment and the
 * queried field is the first field of the index sort.
 *
 * @param reader leaf reader providing {@code maxDoc()} and the index sort metadata
 * @param numericDocValues single-valued doc values, scanned forward to find the range boundaries
 * @param skipper skipper of the queried field; it is advanced as a side effect
 * @return an iterator over the matching doc ID interval (possibly empty), or {@code null} if the
 *     shortcut cannot be used
 * @throws IOException if the skipper or the doc values cannot be read
 */
private DocIdSetIterator getDocIdSetIteratorOrNullForPrimarySort(
    LeafReader reader, NumericDocValues numericDocValues, DocValuesSkipper skipper)
    throws IOException {
  // the shortcut requires every document of the segment to have a value
  if (skipper.docCount() != reader.maxDoc()) {
    return null;
  }
  final Sort indexSort = reader.getMetaData().getSort();
  if (indexSort == null) {
    return null;
  }
  final var sortFields = indexSort.getSort();
  if (sortFields.length == 0 || sortFields[0].getField().equals(field) == false) {
    return null;
  }

  // Both the skipper and the doc values only move forward, so the lower doc ID boundary must be
  // resolved before the upper one.
  final int firstMatch; // inclusive
  final int endOfMatches; // exclusive
  if (sortFields[0].getReverse() == false) {
    // ascending sort: values below the range come first, values above the range come last
    if (skipper.minValue() < lowerValue) {
      skipper.advance(lowerValue, Long.MAX_VALUE);
      firstMatch = nextDoc(skipper.minDocID(0), numericDocValues, v -> v >= lowerValue);
    } else {
      firstMatch = 0;
    }
    if (skipper.maxValue() > upperValue) {
      skipper.advance(upperValue, Long.MAX_VALUE);
      endOfMatches = nextDoc(skipper.minDocID(0), numericDocValues, v -> v > upperValue);
    } else {
      endOfMatches = skipper.docCount();
    }
  } else {
    // descending sort: values above the range come first, values below the range come last
    if (skipper.maxValue() > upperValue) {
      skipper.advance(Long.MIN_VALUE, upperValue);
      firstMatch = nextDoc(skipper.minDocID(0), numericDocValues, v -> v <= upperValue);
    } else {
      firstMatch = 0;
    }
    if (skipper.minValue() < lowerValue) {
      skipper.advance(Long.MIN_VALUE, lowerValue);
      endOfMatches = nextDoc(skipper.minDocID(0), numericDocValues, v -> v < lowerValue);
    } else {
      endOfMatches = skipper.docCount();
    }
  }

  if (firstMatch == endOfMatches) {
    return DocIdSetIterator.empty();
  }
  return DocIdSetIterator.range(firstMatch, endOfMatches);
}
/**
 * Returns the first doc ID at or after {@code startDoc} (and not before the current position of
 * {@code docValues}) whose value satisfies {@code predicate}, or {@link
 * DocIdSetIterator#NO_MORE_DOCS} if the doc values are exhausted first.
 *
 * @param startDoc doc ID to start looking from
 * @param docValues doc values to scan; left positioned on the returned doc
 * @param predicate condition tested against each visited value
 * @throws IOException if the doc values cannot be read
 */
private static int nextDoc(int startDoc, NumericDocValues docValues, LongPredicate predicate)
    throws IOException {
  int current = docValues.docID();
  // only move the iterator when it is still behind the requested start
  if (current < startDoc) {
    current = docValues.advance(startDoc);
  }
  while (current < DocIdSetIterator.NO_MORE_DOCS
      && predicate.test(docValues.longValue()) == false) {
    current = docValues.nextDoc();
  }
  return current;
}
}

View File

@ -18,8 +18,10 @@ package org.apache.lucene.document;
import java.io.IOException;
import java.util.Objects;
import java.util.function.LongPredicate;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.DocValuesSkipper;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
@ -33,6 +35,7 @@ import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.ScorerSupplier;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.BytesRef;
@ -150,7 +153,6 @@ final class SortedSetDocValuesRangeQuery extends Query {
}
}
// no terms matched in this segment
// no terms matched in this segment
if (minOrd > maxOrd
|| (skipper != null
@ -158,9 +160,26 @@ final class SortedSetDocValuesRangeQuery extends Query {
return new ConstantScoreScorer(score(), scoreMode, DocIdSetIterator.empty());
}
// all terms matched in this segment
if (skipper != null
&& skipper.docCount() == context.reader().maxDoc()
&& skipper.minValue() >= minOrd
&& skipper.maxValue() <= maxOrd) {
return new ConstantScoreScorer(
score(), scoreMode, DocIdSetIterator.all(skipper.docCount()));
}
final SortedDocValues singleton = DocValues.unwrapSingleton(values);
TwoPhaseIterator iterator;
if (singleton != null) {
if (skipper != null) {
final DocIdSetIterator psIterator =
getDocIdSetIteratorOrNullForPrimarySort(
context.reader(), singleton, skipper, minOrd, maxOrd);
if (psIterator != null) {
return new ConstantScoreScorer(score(), scoreMode, psIterator);
}
}
iterator =
new TwoPhaseIterator(singleton) {
@Override
@ -216,4 +235,69 @@ final class SortedSetDocValuesRangeQuery extends Query {
}
};
}
/**
 * Tries to express the documents whose ordinal lies in {@code [minOrd, maxOrd]} as a single
 * contiguous run of doc IDs. This only applies when the skipper reports a value for every
 * document of the segment and the queried field is the first field of the index sort.
 *
 * @param reader leaf reader providing {@code maxDoc()} and the index sort metadata
 * @param sortedDocValues single-valued doc values, scanned forward to find the range boundaries
 * @param skipper skipper of the queried field; it is advanced as a side effect
 * @param minOrd smallest matching ordinal (inclusive)
 * @param maxOrd largest matching ordinal (inclusive)
 * @return an iterator over the matching doc ID interval (possibly empty), or {@code null} if the
 *     shortcut cannot be used
 * @throws IOException if the skipper or the doc values cannot be read
 */
private DocIdSetIterator getDocIdSetIteratorOrNullForPrimarySort(
    LeafReader reader,
    SortedDocValues sortedDocValues,
    DocValuesSkipper skipper,
    long minOrd,
    long maxOrd)
    throws IOException {
  // the shortcut requires every document of the segment to have a value
  if (skipper.docCount() != reader.maxDoc()) {
    return null;
  }
  final Sort indexSort = reader.getMetaData().getSort();
  if (indexSort == null) {
    return null;
  }
  final var sortFields = indexSort.getSort();
  if (sortFields.length == 0 || sortFields[0].getField().equals(field) == false) {
    return null;
  }

  // Both the skipper and the doc values only move forward, so the lower doc ID boundary must be
  // resolved before the upper one.
  final int firstMatch; // inclusive
  final int endOfMatches; // exclusive
  if (sortFields[0].getReverse() == false) {
    // ascending sort: ordinals below the range come first, ordinals above the range come last
    if (skipper.minValue() < minOrd) {
      skipper.advance(minOrd, Long.MAX_VALUE);
      firstMatch = nextDoc(skipper.minDocID(0), sortedDocValues, ord -> ord >= minOrd);
    } else {
      firstMatch = 0;
    }
    if (skipper.maxValue() > maxOrd) {
      skipper.advance(maxOrd, Long.MAX_VALUE);
      endOfMatches = nextDoc(skipper.minDocID(0), sortedDocValues, ord -> ord > maxOrd);
    } else {
      endOfMatches = skipper.docCount();
    }
  } else {
    // descending sort: ordinals above the range come first, ordinals below the range come last
    if (skipper.maxValue() > maxOrd) {
      skipper.advance(Long.MIN_VALUE, maxOrd);
      firstMatch = nextDoc(skipper.minDocID(0), sortedDocValues, ord -> ord <= maxOrd);
    } else {
      firstMatch = 0;
    }
    if (skipper.minValue() < minOrd) {
      skipper.advance(Long.MIN_VALUE, minOrd);
      endOfMatches = nextDoc(skipper.minDocID(0), sortedDocValues, ord -> ord < minOrd);
    } else {
      endOfMatches = skipper.docCount();
    }
  }

  if (firstMatch == endOfMatches) {
    return DocIdSetIterator.empty();
  }
  return DocIdSetIterator.range(firstMatch, endOfMatches);
}
/**
 * Returns the first doc ID at or after {@code startDoc} (and not before the current position of
 * {@code docValues}) whose ordinal satisfies {@code predicate}, or {@link
 * DocIdSetIterator#NO_MORE_DOCS} if the doc values are exhausted first.
 *
 * @param startDoc doc ID to start looking from
 * @param docValues doc values to scan; left positioned on the returned doc
 * @param predicate condition tested against each visited ordinal
 * @throws IOException if the doc values cannot be read
 */
private static int nextDoc(int startDoc, SortedDocValues docValues, LongPredicate predicate)
    throws IOException {
  int current = docValues.docID();
  // only move the iterator when it is still behind the requested start
  if (current < startDoc) {
    current = docValues.advance(startDoc);
  }
  while (current < DocIdSetIterator.NO_MORE_DOCS
      && predicate.test(docValues.ordValue()) == false) {
    current = docValues.nextDoc();
  }
  return current;
}
}

View File

@ -98,4 +98,28 @@ public abstract class DocValuesSkipper {
/** Return the global number of documents with a value for the field. */
public abstract int docCount();
/**
 * Advances this skipper until its level-0 interval intersects the range given by {@code
 * minValue} and {@code maxValue}. If no remaining interval intersects the range, the skipper
 * ends up exhausted.
 *
 * @param minValue lower bound of the requested value range
 * @param maxValue upper bound of the requested value range
 * @throws IOException if the underlying {@code advance(int)} call fails
 */
public final void advance(long minValue, long maxValue) throws IOException {
  if (minDocID(0) == -1) {
    // #advance has not been called yet
    advance(0);
  }
  while (true) {
    if (minDocID(0) == DocIdSetIterator.NO_MORE_DOCS) {
      // exhausted: nothing left that could intersect
      return;
    }
    if (minValue(0) <= maxValue && maxValue(0) >= minValue) {
      // the current interval intersects the provided range
      return;
    }
    // Walk up the levels for as long as they are disjoint from the range as well, so that as
    // many docs as possible are skipped in one jump.
    int lastDisjointDoc = maxDocID(0);
    for (int level = 1; level < numLevels(); level++) {
      if (minValue(level) <= maxValue && maxValue(level) >= minValue) {
        break;
      }
      lastDisjointDoc = maxDocID(level);
    }
    advance(lastDisjointDoc + 1);
  }
}
}

View File

@ -22,6 +22,8 @@ import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.LongPoint;
@ -31,6 +33,7 @@ import org.apache.lucene.document.SortedNumericDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.index.RandomIndexWriter;
@ -42,24 +45,80 @@ import org.apache.lucene.util.NumericUtils;
public class TestDocValuesQueries extends LuceneTestCase {
/** Builds a codec that always uses a doc values format with a randomly chosen interval size. */
private Codec getCodec() {
  // small interval size to test with many intervals
  final int intervalSize = random().nextInt(4, 16);
  final Lucene90DocValuesFormat docValuesFormat = new Lucene90DocValuesFormat(intervalSize);
  return TestUtil.alwaysDocValuesFormat(docValuesFormat);
}
public void testDuelPointRangeSortedNumericRangeQuery() throws IOException {
doTestDuelPointRangeNumericRangeQuery(true, 1);
doTestDuelPointRangeNumericRangeQuery(true, 1, false);
}
/**
 * Runs the numeric duel helper with {@code sortedNumeric = true}, {@code maxValuesPerDoc = 1} and
 * {@code skypper = true}, i.e. fields are created through {@code
 * SortedNumericDocValuesField.indexedField}.
 *
 * <p>NOTE(review): "Slipper" in the method name looks like a typo for "Skipper" - confirm before
 * renaming.
 */
public void testDuelPointRangeSortedNumericRangeWithSlipperQuery() throws IOException {
doTestDuelPointRangeNumericRangeQuery(true, 1, true);
}
public void testDuelPointRangeMultivaluedSortedNumericRangeQuery() throws IOException {
doTestDuelPointRangeNumericRangeQuery(true, 3);
doTestDuelPointRangeNumericRangeQuery(true, 3, false);
}
/**
 * Runs the numeric duel helper with {@code sortedNumeric = true}, {@code maxValuesPerDoc = 3} and
 * {@code skypper = true}, i.e. fields are created through {@code
 * SortedNumericDocValuesField.indexedField}.
 */
public void testDuelPointRangeMultivaluedSortedNumericRangeWithSkipperQuery() throws IOException {
doTestDuelPointRangeNumericRangeQuery(true, 3, true);
}
public void testDuelPointRangeNumericRangeQuery() throws IOException {
doTestDuelPointRangeNumericRangeQuery(false, 1);
doTestDuelPointRangeNumericRangeQuery(false, 1, false);
}
private void doTestDuelPointRangeNumericRangeQuery(boolean sortedNumeric, int maxValuesPerDoc)
throws IOException {
/**
 * Runs the numeric duel helper with {@code sortedNumeric = false}, {@code maxValuesPerDoc = 1}
 * and {@code skypper = true}, i.e. fields are created through {@code
 * NumericDocValuesField.indexedField}.
 */
public void testDuelPointRangeNumericRangeWithSkipperQuery() throws IOException {
doTestDuelPointRangeNumericRangeQuery(false, 1, true);
}
/**
 * Builds an index sorted on the numeric field {@code "dv"} (random sort direction), where every
 * document carries the same value both as {@code NumericDocValuesField.indexedField("dv", ...)}
 * and as a {@code LongPoint} named {@code "idx"}, then checks over 100 random ranges that the
 * point range query and the slow doc-values range query match the same documents.
 */
public void testDuelPointNumericSortedWithSkipperRangeQuery() throws IOException {
  final Directory directory = newDirectory();
  final IndexWriterConfig writerConfig = new IndexWriterConfig().setCodec(getCodec());
  final SortField primarySort = new SortField("dv", SortField.Type.LONG, random().nextBoolean());
  writerConfig.setIndexSort(new Sort(primarySort));
  final RandomIndexWriter writer = new RandomIndexWriter(random(), directory, writerConfig);

  final int docCount = atLeast(1000);
  int indexed = 0;
  while (indexed < docCount) {
    final Document document = new Document();
    final long value = TestUtil.nextLong(random(), -100, 10000);
    document.add(NumericDocValuesField.indexedField("dv", value));
    document.add(new LongPoint("idx", value));
    writer.addDocument(document);
    indexed++;
  }

  final IndexReader reader = writer.getReader();
  final IndexSearcher searcher = newSearcher(reader, false);
  writer.close();

  for (int round = 0; round < 100; round++) {
    // each bound is either unbounded or a random value from the indexed value interval
    final long lower;
    if (random().nextBoolean()) {
      lower = Long.MIN_VALUE;
    } else {
      lower = TestUtil.nextLong(random(), -100, 10000);
    }
    final long upper;
    if (random().nextBoolean()) {
      upper = Long.MAX_VALUE;
    } else {
      upper = TestUtil.nextLong(random(), -100, 10000);
    }
    final Query pointQuery = LongPoint.newRangeQuery("idx", lower, upper);
    final Query docValuesQuery = NumericDocValuesField.newSlowRangeQuery("dv", lower, upper);
    assertSameMatches(searcher, pointQuery, docValuesQuery, false);
  }

  reader.close();
  directory.close();
}
private void doTestDuelPointRangeNumericRangeQuery(
boolean sortedNumeric, int maxValuesPerDoc, boolean skypper) throws IOException {
final int iters = atLeast(10);
for (int iter = 0; iter < iters; ++iter) {
Directory dir = newDirectory();
RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
RandomIndexWriter iw;
if (sortedNumeric || random().nextBoolean()) {
iw = new RandomIndexWriter(random(), dir);
} else {
IndexWriterConfig config = new IndexWriterConfig().setCodec(getCodec());
config.setIndexSort(
new Sort(new SortField("dv", SortField.Type.LONG, random().nextBoolean())));
iw = new RandomIndexWriter(random(), dir, config);
}
final int numDocs = atLeast(100);
for (int i = 0; i < numDocs; ++i) {
Document doc = new Document();
@ -67,10 +126,18 @@ public class TestDocValuesQueries extends LuceneTestCase {
for (int j = 0; j < numValues; ++j) {
final long value = TestUtil.nextLong(random(), -100, 10000);
if (sortedNumeric) {
if (skypper) {
doc.add(SortedNumericDocValuesField.indexedField("dv", value));
} else {
doc.add(new SortedNumericDocValuesField("dv", value));
}
} else {
if (skypper) {
doc.add(NumericDocValuesField.indexedField("dv", value));
} else {
doc.add(new NumericDocValuesField("dv", value));
}
}
doc.add(new LongPoint("idx", value));
}
iw.addDocument(doc);
@ -102,12 +169,20 @@ public class TestDocValuesQueries extends LuceneTestCase {
}
}
private void doTestDuelPointRangeSortedRangeQuery(boolean sortedSet, int maxValuesPerDoc)
throws IOException {
private void doTestDuelPointRangeSortedRangeQuery(
boolean sortedSet, int maxValuesPerDoc, boolean skypper) throws IOException {
final int iters = atLeast(10);
for (int iter = 0; iter < iters; ++iter) {
Directory dir = newDirectory();
RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
RandomIndexWriter iw;
if (sortedSet || random().nextBoolean()) {
iw = new RandomIndexWriter(random(), dir);
} else {
IndexWriterConfig config = new IndexWriterConfig().setCodec(getCodec());
config.setIndexSort(
new Sort(new SortField("dv", SortField.Type.STRING, random().nextBoolean())));
iw = new RandomIndexWriter(random(), dir, config);
}
final int numDocs = atLeast(100);
for (int i = 0; i < numDocs; ++i) {
Document doc = new Document();
@ -117,10 +192,18 @@ public class TestDocValuesQueries extends LuceneTestCase {
byte[] encoded = new byte[Long.BYTES];
LongPoint.encodeDimension(value, encoded, 0);
if (sortedSet) {
if (skypper) {
doc.add(SortedSetDocValuesField.indexedField("dv", newBytesRef(encoded)));
} else {
doc.add(new SortedSetDocValuesField("dv", newBytesRef(encoded)));
}
} else {
if (skypper) {
doc.add(SortedDocValuesField.indexedField("dv", newBytesRef(encoded)));
} else {
doc.add(new SortedDocValuesField("dv", newBytesRef(encoded)));
}
}
doc.add(new LongPoint("idx", value));
}
iw.addDocument(doc);
@ -179,15 +262,79 @@ public class TestDocValuesQueries extends LuceneTestCase {
}
public void testDuelPointRangeSortedSetRangeQuery() throws IOException {
doTestDuelPointRangeSortedRangeQuery(true, 1);
doTestDuelPointRangeSortedRangeQuery(true, 1, false);
}
/**
 * Runs the sorted duel helper with {@code sortedSet = true}, {@code maxValuesPerDoc = 1} and
 * {@code skypper = true}, i.e. fields are created through {@code
 * SortedSetDocValuesField.indexedField}.
 */
public void testDuelPointRangeSortedSetRangeSkipperQuery() throws IOException {
doTestDuelPointRangeSortedRangeQuery(true, 1, true);
}
public void testDuelPointRangeMultivaluedSortedSetRangeQuery() throws IOException {
doTestDuelPointRangeSortedRangeQuery(true, 3);
doTestDuelPointRangeSortedRangeQuery(true, 3, false);
}
/**
 * Runs the sorted duel helper with {@code sortedSet = true}, {@code maxValuesPerDoc = 3} and
 * {@code skypper = true}, i.e. fields are created through {@code
 * SortedSetDocValuesField.indexedField}.
 */
public void testDuelPointRangeMultivaluedSortedSetRangeSkipperQuery() throws IOException {
doTestDuelPointRangeSortedRangeQuery(true, 3, true);
}
public void testDuelPointRangeSortedRangeQuery() throws IOException {
doTestDuelPointRangeSortedRangeQuery(false, 1);
doTestDuelPointRangeSortedRangeQuery(false, 1, false);
}
/**
 * Runs the sorted duel helper with {@code sortedSet = false}, {@code maxValuesPerDoc = 1} and
 * {@code skypper = true}, i.e. fields are created through {@code
 * SortedDocValuesField.indexedField}.
 */
public void testDuelPointRangeSortedRangeSkipperQuery() throws IOException {
doTestDuelPointRangeSortedRangeQuery(false, 1, true);
}
/**
 * Builds an index sorted on the {@code "dv"} field as {@code SortField.Type.STRING} (random sort
 * direction), where every document carries the same long value both as a {@code LongPoint} named
 * {@code "idx"} and, in {@code LongPoint}-encoded binary form, as {@code
 * SortedDocValuesField.indexedField("dv", ...)}. It then checks over 100 random ranges that the
 * point range query and the slow sorted doc-values range query match the same documents.
 */
public void testDuelPointSortedSetSortedWithSkipperRangeQuery() throws IOException {
Directory dir = newDirectory();
IndexWriterConfig config = new IndexWriterConfig().setCodec(getCodec());
config.setIndexSort(
new Sort(new SortField("dv", SortField.Type.STRING, random().nextBoolean())));
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, config);
final int numDocs = atLeast(1000);
for (int i = 0; i < numDocs; ++i) {
Document doc = new Document();
final long value = TestUtil.nextLong(random(), -100, 10000);
byte[] encoded = new byte[Long.BYTES];
// store the point encoding of the value as the doc-values term
LongPoint.encodeDimension(value, encoded, 0);
doc.add(SortedDocValuesField.indexedField("dv", newBytesRef(encoded)));
doc.add(new LongPoint("idx", value));
iw.addDocument(doc);
}
final IndexReader reader = iw.getReader();
final IndexSearcher searcher = newSearcher(reader, false);
iw.close();
for (int i = 0; i < 100; ++i) {
// each bound is either unbounded or a random value from the indexed value interval
long min = random().nextBoolean() ? Long.MIN_VALUE : TestUtil.nextLong(random(), -100, 10000);
long max = random().nextBoolean() ? Long.MAX_VALUE : TestUtil.nextLong(random(), -100, 10000);
byte[] encodedMin = new byte[Long.BYTES];
byte[] encodedMax = new byte[Long.BYTES];
// The bounds are encoded BEFORE they are adjusted below: the doc-values query keeps the
// original bound and marks it exclusive, while the point query (always inclusive here) gets
// the bound shifted by one.
LongPoint.encodeDimension(min, encodedMin, 0);
LongPoint.encodeDimension(max, encodedMax, 0);
boolean includeMin = true;
boolean includeMax = true;
if (random().nextBoolean()) {
includeMin = false;
min++;
}
if (random().nextBoolean()) {
includeMax = false;
max--;
}
final Query q1 = LongPoint.newRangeQuery("idx", min, max);
// when a bound is still at its extreme value, randomly pass null instead of the encoded bound
final Query q2 =
SortedDocValuesField.newSlowRangeQuery(
"dv",
min == Long.MIN_VALUE && random().nextBoolean() ? null : newBytesRef(encodedMin),
max == Long.MAX_VALUE && random().nextBoolean() ? null : newBytesRef(encodedMax),
includeMin,
includeMax);
assertSameMatches(searcher, q1, q2, false);
}
reader.close();
dir.close();
}
private void assertSameMatches(IndexSearcher searcher, Query q1, Query q2, boolean scores)