LUCENE-10425: Speed up IndexSortSortedNumericDocValuesRangeQuery#BoundedDocIdSetIterator construction using BKD binary search (#687)

This commit is contained in:
jianping weng 2022-09-22 14:51:13 +08:00 committed by GitHub
parent bcc116057d
commit 5b24a233bd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 282 additions and 0 deletions

View File

@ -101,6 +101,9 @@ Improvements
* GITHUB#11785: Improve Tessellator performance by delaying calls to the method * GITHUB#11785: Improve Tessellator performance by delaying calls to the method
#isIntersectingPolygon (Ignacio Vera) #isIntersectingPolygon (Ignacio Vera)
* GITHUB#687: Speed up IndexSortSortedNumericDocValuesRangeQuery#BoundedDocIdSetIterator
construction using BKD binary search. (Jianping Weng)
Bug Fixes Bug Fixes
--------------------- ---------------------
* GITHUB#11726: Indexing term vectors on large documents could fail due to * GITHUB#11726: Indexing term vectors on large documents could fail due to

View File

@ -18,12 +18,17 @@ package org.apache.lucene.sandbox.search;
import java.io.IOException; import java.io.IOException;
import java.util.Objects; import java.util.Objects;
import java.util.function.Predicate;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.index.DocValues; import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PointValues; import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.search.ConstantScoreScorer; import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight; import org.apache.lucene.search.ConstantScoreWeight;
@ -43,6 +48,8 @@ import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortField.Type; import org.apache.lucene.search.SortField.Type;
import org.apache.lucene.search.SortedNumericSortField; import org.apache.lucene.search.SortedNumericSortField;
import org.apache.lucene.search.Weight; import org.apache.lucene.search.Weight;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ArrayUtil.ByteArrayComparator;
/** /**
* A range query that can take advantage of the fact that the index is sorted to speed up execution. * A range query that can take advantage of the fact that the index is sorted to speed up execution.
@ -214,12 +221,172 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query {
}; };
} }
/**
 * Searches the point tree of {@code values} for the first document whose packed value is greater
 * than {@code packedValue}, or greater than or equal to it when {@code allowEqual} is true.
 * Values are compared as unsigned byte sequences.
 *
 * @param values the point values to search; asserted to be one-dimensional
 * @param packedValue the packed value every candidate is compared against
 * @param allowEqual whether a packed value equal to {@code packedValue} also qualifies
 * @return the first qualifying document, or -1 if no packed value qualifies
 * @throws IOException if reading the point values fails
 */
public final int nextDoc(PointValues values, byte[] packedValue, boolean allowEqual)
    throws IOException {
  assert values.getNumDimensions() == 1;
  final ByteArrayComparator unsignedOrder =
      ArrayUtil.getUnsignedComparator(values.getBytesPerDimension());
  final Predicate<byte[]> qualifies =
      candidate -> {
        final int order = unsignedOrder.compare(candidate, 0, packedValue, 0);
        // Equality only counts when the caller asked for an inclusive bound.
        return allowEqual ? order >= 0 : order > 0;
      };
  return nextDoc(values.getPointTree(), qualifies);
}
/**
 * Recursively searches the subtree under the current position of {@code pointTree} for the first
 * visited document whose packed value satisfies {@code biggerThan}.
 *
 * <p>When nothing is found below an inner node, the cursor is moved back to that node before
 * returning. When a match is found the method returns immediately and leaves the cursor where
 * the match was located.
 *
 * @param pointTree tree cursor positioned on the node to search
 * @param biggerThan predicate applied to packed values
 * @return the matching docID, or -1 if the subtree holds no matching value
 * @throws IOException if reading the tree fails
 */
private int nextDoc(PointValues.PointTree pointTree, Predicate<byte[]> biggerThan)
    throws IOException {
  if (!biggerThan.test(pointTree.getMaxPackedValue())) {
    // Even the largest value below this node fails the predicate: nothing to find here.
    return -1;
  }

  if (pointTree.moveToChild()) {
    // Inner node: probe the children one after another.
    boolean hasNode = true;
    while (hasNode) {
      final int found = nextDoc(pointTree, biggerThan);
      if (found != -1) {
        return found;
      }
      hasNode = pointTree.moveToSibling();
    }
    // No child matched; step back up so the caller can continue with its own siblings.
    pointTree.moveToParent();
    return -1;
  }

  // Leaf node: remember the first visited document whose value passes the predicate.
  class FirstMatchVisitor implements IntersectVisitor {
    private int firstDoc = -1;

    @Override
    public void visit(int docID) {
      throw new AssertionError("Invalid call to visit(docID)");
    }

    @Override
    public void visit(int docID, byte[] packedValue) {
      if (firstDoc == -1 && biggerThan.test(packedValue)) {
        firstDoc = docID;
      }
    }

    @Override
    public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
      return Relation.CELL_CROSSES_QUERY;
    }
  }

  final FirstMatchVisitor visitor = new FirstMatchVisitor();
  pointTree.visitDocValues(visitor);
  return visitor.firstDoc;
}
/**
 * Tells whether the query range is disjoint from the indexed values: in at least one dimension
 * the smallest indexed value is above the query's upper bound, or the largest indexed value is
 * below the query's lower bound. Values are compared as unsigned byte sequences.
 *
 * @param points the point values whose global min/max are inspected
 * @param queryLowerPoint packed inclusive lower bound of the query
 * @param queryUpperPoint packed inclusive upper bound of the query
 * @return {@code true} if no indexed value can fall inside the query range
 * @throws IOException if reading the point values metadata fails
 */
private boolean matchNone(PointValues points, byte[] queryLowerPoint, byte[] queryUpperPoint)
    throws IOException {
  // Fetch the loop-invariant metadata once instead of once per comparison.
  final int numDims = points.getNumDimensions();
  final int bytesPerDim = points.getBytesPerDimension();
  final byte[] minPackedValue = points.getMinPackedValue();
  final byte[] maxPackedValue = points.getMaxPackedValue();
  final ByteArrayComparator comparator = ArrayUtil.getUnsignedComparator(bytesPerDim);
  for (int dim = 0; dim < numDims; dim++) {
    final int offset = dim * bytesPerDim;
    if (comparator.compare(minPackedValue, offset, queryUpperPoint, offset) > 0
        || comparator.compare(maxPackedValue, offset, queryLowerPoint, offset) < 0) {
      return true;
    }
  }
  return false;
}
/**
 * Tells whether every indexed value lies inside the query range: in every dimension the smallest
 * indexed value is not below the query's lower bound and the largest indexed value is not above
 * the query's upper bound. Values are compared as unsigned byte sequences.
 *
 * @param points the point values whose global min/max are inspected
 * @param queryLowerPoint packed inclusive lower bound of the query
 * @param queryUpperPoint packed inclusive upper bound of the query
 * @return {@code true} if the query range contains the whole indexed value range
 * @throws IOException if reading the point values metadata fails
 */
private boolean matchAll(PointValues points, byte[] queryLowerPoint, byte[] queryUpperPoint)
    throws IOException {
  // Fetch the loop-invariant metadata once instead of once per comparison.
  final int numDims = points.getNumDimensions();
  final int bytesPerDim = points.getBytesPerDimension();
  final byte[] minPackedValue = points.getMinPackedValue();
  final byte[] maxPackedValue = points.getMaxPackedValue();
  final ByteArrayComparator comparator = ArrayUtil.getUnsignedComparator(bytesPerDim);
  for (int dim = 0; dim < numDims; dim++) {
    final int offset = dim * bytesPerDim;
    // Containment test. A range that is disjoint from the query (min > upper or max < lower)
    // necessarily fails it too as long as min <= max, so no separate disjointness check is
    // needed.
    if (comparator.compare(minPackedValue, offset, queryLowerPoint, offset) < 0
        || comparator.compare(maxPackedValue, offset, queryUpperPoint, offset) > 0) {
      return false;
    }
  }
  return true;
}
/**
 * Tries to derive the range of matching doc IDs from the field's point values instead of
 * scanning doc values.
 *
 * <p>The shortcut only applies when the segment's primary index sort is on this query's field in
 * ascending order, and the field has one-dimensional points of int or long width with exactly
 * one point per document that has points. NOTE(review): this presumes the point values mirror
 * the doc values of the same field; nothing here verifies that.
 *
 * @param context the leaf to search
 * @param delegate iterator used to filter the doc ID range when not every document in the
 *     segment has a point
 * @return an iterator bounded to the matching doc ID range (empty when nothing can match), or
 *     {@code null} if the shortcut is not applicable to this segment
 * @throws IOException if reading the point values fails
 */
private BoundedDocIdSetIterator getDocIdSetIteratorOrNullFromBkd(
    LeafReaderContext context, DocIdSetIterator delegate) throws IOException {
  final Sort indexSort = context.reader().getMetaData().getSort();
  if (indexSort == null || indexSort.getSort().length == 0) {
    return null;
  }
  final SortField primarySort = indexSort.getSort()[0];
  // Compare from the query's field so that a sort without a field name cannot throw here.
  if (field.equals(primarySort.getField()) == false || primarySort.getReverse()) {
    return null;
  }

  final PointValues points = context.reader().getPointValues(field);
  if (points == null || points.getNumDimensions() != 1) {
    return null;
  }
  final int bytesPerDim = points.getBytesPerDimension();
  final boolean intEncoded = bytesPerDim == Integer.BYTES;
  if (intEncoded == false && bytesPerDim != Long.BYTES) {
    return null;
  }
  // Each doc that has points must have exactly one point.
  if (points.size() != points.getDocCount()) {
    return null;
  }

  if (lowerValue > upperValue) {
    return new BoundedDocIdSetIterator(0, 0, null);
  }

  final byte[] queryLowerPoint;
  final byte[] queryUpperPoint;
  if (intEncoded) {
    // The query bounds are longs. A plain (int) cast would silently wrap bounds outside the int
    // range (e.g. Long.MIN_VALUE becomes 0), so reject ranges that cannot intersect the int
    // domain and clamp the remaining bounds to it.
    if (lowerValue > Integer.MAX_VALUE || upperValue < Integer.MIN_VALUE) {
      return new BoundedDocIdSetIterator(0, 0, null);
    }
    queryLowerPoint = IntPoint.pack((int) Math.max(lowerValue, Integer.MIN_VALUE)).bytes;
    queryUpperPoint = IntPoint.pack((int) Math.min(upperValue, Integer.MAX_VALUE)).bytes;
  } else {
    queryLowerPoint = LongPoint.pack(lowerValue).bytes;
    queryUpperPoint = LongPoint.pack(upperValue).bytes;
  }

  if (matchNone(points, queryLowerPoint, queryUpperPoint)) {
    return new BoundedDocIdSetIterator(0, 0, null);
  }

  final int maxDoc = context.reader().maxDoc();
  final int minDocId;
  int maxDocId;
  if (matchAll(points, queryLowerPoint, queryUpperPoint)) {
    minDocId = 0;
    maxDocId = maxDoc;
  } else {
    // First doc whose value is >= queryLowerPoint.
    minDocId = nextDoc(points, queryLowerPoint, true);
    if (minDocId == -1) {
      return new BoundedDocIdSetIterator(0, 0, null);
    }
    // First doc whose value is > queryUpperPoint; it is the exclusive end of the range.
    maxDocId = nextDoc(points, queryUpperPoint, false);
    if (maxDocId == -1) {
      maxDocId = maxDoc;
    }
  }

  // When every document has a point the doc ID range is exact; otherwise the delegate is passed
  // along so documents without a value can be filtered out.
  final boolean everyDocHasPoint = points.getDocCount() == maxDoc;
  return new BoundedDocIdSetIterator(minDocId, maxDocId, everyDocHasPoint ? null : delegate);
}
private BoundedDocIdSetIterator getDocIdSetIteratorOrNull(LeafReaderContext context) private BoundedDocIdSetIterator getDocIdSetIteratorOrNull(LeafReaderContext context)
throws IOException { throws IOException {
SortedNumericDocValues sortedNumericValues = SortedNumericDocValues sortedNumericValues =
DocValues.getSortedNumeric(context.reader(), field); DocValues.getSortedNumeric(context.reader(), field);
NumericDocValues numericValues = DocValues.unwrapSingleton(sortedNumericValues); NumericDocValues numericValues = DocValues.unwrapSingleton(sortedNumericValues);
if (numericValues != null) { if (numericValues != null) {
BoundedDocIdSetIterator iterator = getDocIdSetIteratorOrNullFromBkd(context, numericValues);
if (iterator != null) {
return iterator;
}
Sort indexSort = context.reader().getMetaData().getSort(); Sort indexSort = context.reader().getMetaData().getSort();
if (indexSort != null if (indexSort != null
&& indexSort.getSort().length > 0 && indexSort.getSort().length > 0

View File

@ -19,6 +19,7 @@ package org.apache.lucene.sandbox.search;
import static org.hamcrest.CoreMatchers.instanceOf; import static org.hamcrest.CoreMatchers.instanceOf;
import java.io.IOException; import java.io.IOException;
import java.util.Random;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.document.LongPoint; import org.apache.lucene.document.LongPoint;
@ -641,4 +642,115 @@ public class TestIndexSortSortedNumericDocValuesRangeQuery extends LuceneTestCas
return new IndexSortSortedNumericDocValuesRangeQuery( return new IndexSortSortedNumericDocValuesRangeQuery(
field, lowerValue, upperValue, fallbackQuery); field, lowerValue, upperValue, fallbackQuery);
} }
/**
 * Indexes 500 documents for each of the values 6, 5, 8, 9 and 7 into an index sorted on the
 * field, merges to a single segment, and checks the per-leaf count reported for several ranges.
 */
public void testCountWithBkd() throws IOException {
  final String fieldName = "field";
  Directory dir = newDirectory();
  IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
  iwc.setIndexSort(new Sort(new SortedNumericSortField(fieldName, SortField.Type.LONG, false)));
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);

  // Values are deliberately added out of order; the index sort reorders them.
  for (long value : new long[] {6, 5, 8, 9, 7}) {
    addDocWithBkd(writer, fieldName, value, 500);
  }
  writer.flush();
  writer.forceMerge(1);
  IndexReader reader = writer.getReader();
  IndexSearcher searcher = newSearcher(reader);

  // Each row is {lower bound, upper bound, expected count}.
  final int[][] expectations = {
    {6, 8, 1500},
    {6, 10, 2000},
    {4, 6, 1000},
    {2, 10, 2500},
    {2, 3, 0},
    {10, 11, 0},
  };
  for (int[] expectation : expectations) {
    final int lower = expectation[0];
    final int upper = expectation[1];
    final int expectedCount = expectation[2];
    Query fallbackQuery = LongPoint.newRangeQuery(fieldName, lower, upper);
    Query query =
        new IndexSortSortedNumericDocValuesRangeQuery(fieldName, lower, upper, fallbackQuery);
    Weight weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
    for (LeafReaderContext context : searcher.getLeafContexts()) {
      assertEquals(expectedCount, weight.count(context));
    }
  }

  writer.close();
  reader.close();
  dir.close();
}
/**
 * Indexes random values with random repeat counts into an index sorted on the field, then checks
 * for 100 random ranges that the per-leaf count of the index-sort range query equals the count
 * of the plain point range query.
 */
public void testRandomCountWithBkd() throws IOException {
  final String fieldName = "field";
  Directory dir = newDirectory();
  IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
  iwc.setIndexSort(new Sort(new SortedNumericSortField(fieldName, SortField.Type.LONG, false)));
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
  Random random = random();

  for (int batch = 0; batch < 100; batch++) {
    // Draw the value before the repeat count, keeping the order of random draws stable.
    final int value = random.nextInt(1000);
    final int repeat = random.nextInt(1000);
    addDocWithBkd(writer, fieldName, value, repeat);
  }
  writer.flush();
  writer.forceMerge(1);
  IndexReader reader = writer.getReader();
  IndexSearcher searcher = newSearcher(reader);

  for (int round = 0; round < 100; round++) {
    final int first = random.nextInt(1100);
    final int second = random.nextInt(1100);
    final int low = Math.min(first, second);
    final int upper = Math.max(first, second);

    Query rangeQuery = LongPoint.newRangeQuery(fieldName, low, upper);
    Query indexSortRangeQuery =
        new IndexSortSortedNumericDocValuesRangeQuery(fieldName, low, upper, rangeQuery);
    Weight indexSortRangeQueryWeight =
        indexSortRangeQuery.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
    Weight rangeQueryWeight = rangeQuery.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
    for (LeafReaderContext leaf : searcher.getLeafContexts()) {
      assertEquals(rangeQueryWeight.count(leaf), indexSortRangeQueryWeight.count(leaf));
    }
  }

  writer.close();
  reader.close();
  dir.close();
}
/**
 * Adds {@code repeat} documents to {@code indexWriter}, each carrying {@code value} both as a
 * sorted numeric doc value and as a long point under the name {@code field}.
 */
private void addDocWithBkd(RandomIndexWriter indexWriter, String field, long value, int repeat)
    throws IOException {
  int remaining = repeat;
  while (remaining > 0) {
    // A fresh document is built for every addition.
    Document document = new Document();
    document.add(new SortedNumericDocValuesField(field, value));
    document.add(new LongPoint(field, value));
    indexWriter.addDocument(document);
    remaining--;
  }
}
} }