mirror of https://github.com/apache/lucene.git
LUCENE-10425:speed up IndexSortSortedNumericDocValuesRangeQuery#BoundedDocSetIdIterator construction using bkd binary search (#687)
This commit is contained in:
parent
bcc116057d
commit
5b24a233bd
|
@ -101,6 +101,9 @@ Improvements
|
|||
* GITHUB#11785: Improve Tessellator performance by delaying calls to the method
|
||||
#isIntersectingPolygon (Ignacio Vera)
|
||||
|
||||
* GITHUB#687: speed up IndexSortSortedNumericDocValuesRangeQuery#BoundedDocIdSetIterator
|
||||
construction using bkd binary search. (Jianping Weng)
|
||||
|
||||
Bug Fixes
|
||||
---------------------
|
||||
* GITHUB#11726: Indexing term vectors on large documents could fail due to
|
||||
|
|
|
@ -18,12 +18,17 @@ package org.apache.lucene.sandbox.search;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.Objects;
|
||||
import java.util.function.Predicate;
|
||||
import org.apache.lucene.document.IntPoint;
|
||||
import org.apache.lucene.document.LongPoint;
|
||||
import org.apache.lucene.index.DocValues;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.LeafReader;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.NumericDocValues;
|
||||
import org.apache.lucene.index.PointValues;
|
||||
import org.apache.lucene.index.PointValues.IntersectVisitor;
|
||||
import org.apache.lucene.index.PointValues.Relation;
|
||||
import org.apache.lucene.index.SortedNumericDocValues;
|
||||
import org.apache.lucene.search.ConstantScoreScorer;
|
||||
import org.apache.lucene.search.ConstantScoreWeight;
|
||||
|
@ -43,6 +48,8 @@ import org.apache.lucene.search.SortField;
|
|||
import org.apache.lucene.search.SortField.Type;
|
||||
import org.apache.lucene.search.SortedNumericSortField;
|
||||
import org.apache.lucene.search.Weight;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.ArrayUtil.ByteArrayComparator;
|
||||
|
||||
/**
|
||||
* A range query that can take advantage of the fact that the index is sorted to speed up execution.
|
||||
|
@ -214,12 +221,172 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query {
|
|||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the first document whose packed value is greater than or equal (if allowEqual is true)
|
||||
* to the provided packed value or -1 if all packed values are smaller than the provided one,
|
||||
*/
|
||||
public final int nextDoc(PointValues values, byte[] packedValue, boolean allowEqual)
|
||||
throws IOException {
|
||||
assert values.getNumDimensions() == 1;
|
||||
final int bytesPerDim = values.getBytesPerDimension();
|
||||
final ByteArrayComparator comparator = ArrayUtil.getUnsignedComparator(bytesPerDim);
|
||||
final Predicate<byte[]> biggerThan =
|
||||
testPackedValue -> {
|
||||
int cmp = comparator.compare(testPackedValue, 0, packedValue, 0);
|
||||
return cmp > 0 || (cmp == 0 && allowEqual);
|
||||
};
|
||||
return nextDoc(values.getPointTree(), biggerThan);
|
||||
}
|
||||
|
||||
private int nextDoc(PointValues.PointTree pointTree, Predicate<byte[]> biggerThan)
|
||||
throws IOException {
|
||||
if (biggerThan.test(pointTree.getMaxPackedValue()) == false) {
|
||||
// doc is before us
|
||||
return -1;
|
||||
} else if (pointTree.moveToChild()) {
|
||||
// navigate down
|
||||
do {
|
||||
final int doc = nextDoc(pointTree, biggerThan);
|
||||
if (doc != -1) {
|
||||
return doc;
|
||||
}
|
||||
} while (pointTree.moveToSibling());
|
||||
pointTree.moveToParent();
|
||||
return -1;
|
||||
} else {
|
||||
// doc is in this leaf
|
||||
final int[] doc = {-1};
|
||||
pointTree.visitDocValues(
|
||||
new IntersectVisitor() {
|
||||
@Override
|
||||
public void visit(int docID) {
|
||||
throw new AssertionError("Invalid call to visit(docID)");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void visit(int docID, byte[] packedValue) {
|
||||
if (doc[0] == -1 && biggerThan.test(packedValue)) {
|
||||
doc[0] = docID;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
|
||||
return Relation.CELL_CROSSES_QUERY;
|
||||
}
|
||||
});
|
||||
return doc[0];
|
||||
}
|
||||
}
|
||||
|
||||
private boolean matchNone(PointValues points, byte[] queryLowerPoint, byte[] queryUpperPoint)
|
||||
throws IOException {
|
||||
final ByteArrayComparator comparator =
|
||||
ArrayUtil.getUnsignedComparator(points.getBytesPerDimension());
|
||||
for (int dim = 0; dim < points.getNumDimensions(); dim++) {
|
||||
int offset = dim * points.getBytesPerDimension();
|
||||
if (comparator.compare(points.getMinPackedValue(), offset, queryUpperPoint, offset) > 0
|
||||
|| comparator.compare(points.getMaxPackedValue(), offset, queryLowerPoint, offset) < 0) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean matchAll(PointValues points, byte[] queryLowerPoint, byte[] queryUpperPoint)
|
||||
throws IOException {
|
||||
final ByteArrayComparator comparator =
|
||||
ArrayUtil.getUnsignedComparator(points.getBytesPerDimension());
|
||||
for (int dim = 0; dim < points.getNumDimensions(); dim++) {
|
||||
int offset = dim * points.getBytesPerDimension();
|
||||
if (comparator.compare(points.getMinPackedValue(), offset, queryUpperPoint, offset) > 0) {
|
||||
return false;
|
||||
}
|
||||
if (comparator.compare(points.getMaxPackedValue(), offset, queryLowerPoint, offset) < 0) {
|
||||
return false;
|
||||
}
|
||||
if (comparator.compare(points.getMinPackedValue(), offset, queryLowerPoint, offset) < 0
|
||||
|| comparator.compare(points.getMaxPackedValue(), offset, queryUpperPoint, offset) > 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private BoundedDocIdSetIterator getDocIdSetIteratorOrNullFromBkd(
|
||||
LeafReaderContext context, DocIdSetIterator delegate) throws IOException {
|
||||
Sort indexSort = context.reader().getMetaData().getSort();
|
||||
if (indexSort != null
|
||||
&& indexSort.getSort().length > 0
|
||||
&& indexSort.getSort()[0].getField().equals(field)
|
||||
&& indexSort.getSort()[0].getReverse() == false) {
|
||||
PointValues points = context.reader().getPointValues(field);
|
||||
if (points == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (points.getNumDimensions() != 1) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (points.getBytesPerDimension() != Long.BYTES
|
||||
&& points.getBytesPerDimension() != Integer.BYTES) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Each doc that has points has exactly one point.
|
||||
if (points.size() == points.getDocCount()) {
|
||||
|
||||
byte[] queryLowerPoint;
|
||||
byte[] queryUpperPoint;
|
||||
if (points.getBytesPerDimension() == Integer.BYTES) {
|
||||
queryLowerPoint = IntPoint.pack((int) lowerValue).bytes;
|
||||
queryUpperPoint = IntPoint.pack((int) upperValue).bytes;
|
||||
} else {
|
||||
queryLowerPoint = LongPoint.pack(lowerValue).bytes;
|
||||
queryUpperPoint = LongPoint.pack(upperValue).bytes;
|
||||
}
|
||||
if (lowerValue > upperValue || matchNone(points, queryLowerPoint, queryUpperPoint)) {
|
||||
return new BoundedDocIdSetIterator(0, 0, null);
|
||||
}
|
||||
int minDocId, maxDocId;
|
||||
if (matchAll(points, queryLowerPoint, queryUpperPoint)) {
|
||||
minDocId = 0;
|
||||
maxDocId = context.reader().maxDoc();
|
||||
} else {
|
||||
// >=queryLowerPoint
|
||||
minDocId = nextDoc(points, queryLowerPoint, true);
|
||||
|
||||
if (minDocId == -1) {
|
||||
return new BoundedDocIdSetIterator(0, 0, null);
|
||||
}
|
||||
// >queryUpperPoint,
|
||||
maxDocId = nextDoc(points, queryUpperPoint, false);
|
||||
if (maxDocId == -1) {
|
||||
maxDocId = context.reader().maxDoc();
|
||||
}
|
||||
}
|
||||
|
||||
if ((points.getDocCount() == context.reader().maxDoc())) {
|
||||
return new BoundedDocIdSetIterator(minDocId, maxDocId, null);
|
||||
} else {
|
||||
return new BoundedDocIdSetIterator(minDocId, maxDocId, delegate);
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private BoundedDocIdSetIterator getDocIdSetIteratorOrNull(LeafReaderContext context)
|
||||
throws IOException {
|
||||
SortedNumericDocValues sortedNumericValues =
|
||||
DocValues.getSortedNumeric(context.reader(), field);
|
||||
NumericDocValues numericValues = DocValues.unwrapSingleton(sortedNumericValues);
|
||||
if (numericValues != null) {
|
||||
BoundedDocIdSetIterator iterator = getDocIdSetIteratorOrNullFromBkd(context, numericValues);
|
||||
if (iterator != null) {
|
||||
return iterator;
|
||||
}
|
||||
Sort indexSort = context.reader().getMetaData().getSort();
|
||||
if (indexSort != null
|
||||
&& indexSort.getSort().length > 0
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.sandbox.search;
|
|||
import static org.hamcrest.CoreMatchers.instanceOf;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Random;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.LongPoint;
|
||||
|
@ -641,4 +642,115 @@ public class TestIndexSortSortedNumericDocValuesRangeQuery extends LuceneTestCas
|
|||
return new IndexSortSortedNumericDocValuesRangeQuery(
|
||||
field, lowerValue, upperValue, fallbackQuery);
|
||||
}
|
||||
|
||||
public void testCountWithBkd() throws IOException {
|
||||
String filedName = "field";
|
||||
Directory dir = newDirectory();
|
||||
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
|
||||
Sort indexSort = new Sort(new SortedNumericSortField(filedName, SortField.Type.LONG, false));
|
||||
iwc.setIndexSort(indexSort);
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
|
||||
addDocWithBkd(writer, filedName, 6, 500);
|
||||
addDocWithBkd(writer, filedName, 5, 500);
|
||||
addDocWithBkd(writer, filedName, 8, 500);
|
||||
addDocWithBkd(writer, filedName, 9, 500);
|
||||
addDocWithBkd(writer, filedName, 7, 500);
|
||||
writer.flush();
|
||||
writer.forceMerge(1);
|
||||
IndexReader reader = writer.getReader();
|
||||
IndexSearcher searcher = newSearcher(reader);
|
||||
|
||||
Query fallbackQuery = LongPoint.newRangeQuery(filedName, 6, 8);
|
||||
Query query = new IndexSortSortedNumericDocValuesRangeQuery(filedName, 6, 8, fallbackQuery);
|
||||
Weight weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
|
||||
for (LeafReaderContext context : searcher.getLeafContexts()) {
|
||||
assertEquals(1500, weight.count(context));
|
||||
}
|
||||
|
||||
fallbackQuery = LongPoint.newRangeQuery(filedName, 6, 10);
|
||||
query = new IndexSortSortedNumericDocValuesRangeQuery(filedName, 6, 10, fallbackQuery);
|
||||
weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
|
||||
for (LeafReaderContext context : searcher.getLeafContexts()) {
|
||||
assertEquals(2000, weight.count(context));
|
||||
}
|
||||
|
||||
fallbackQuery = LongPoint.newRangeQuery(filedName, 4, 6);
|
||||
query = new IndexSortSortedNumericDocValuesRangeQuery(filedName, 4, 6, fallbackQuery);
|
||||
weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
|
||||
for (LeafReaderContext context : searcher.getLeafContexts()) {
|
||||
assertEquals(1000, weight.count(context));
|
||||
}
|
||||
|
||||
fallbackQuery = LongPoint.newRangeQuery(filedName, 2, 10);
|
||||
query = new IndexSortSortedNumericDocValuesRangeQuery(filedName, 2, 10, fallbackQuery);
|
||||
weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
|
||||
for (LeafReaderContext context : searcher.getLeafContexts()) {
|
||||
assertEquals(2500, weight.count(context));
|
||||
}
|
||||
|
||||
fallbackQuery = LongPoint.newRangeQuery(filedName, 2, 3);
|
||||
query = new IndexSortSortedNumericDocValuesRangeQuery(filedName, 2, 3, fallbackQuery);
|
||||
weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
|
||||
for (LeafReaderContext context : searcher.getLeafContexts()) {
|
||||
assertEquals(0, weight.count(context));
|
||||
}
|
||||
|
||||
fallbackQuery = LongPoint.newRangeQuery(filedName, 10, 11);
|
||||
query = new IndexSortSortedNumericDocValuesRangeQuery(filedName, 10, 11, fallbackQuery);
|
||||
weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
|
||||
for (LeafReaderContext context : searcher.getLeafContexts()) {
|
||||
assertEquals(0, weight.count(context));
|
||||
}
|
||||
|
||||
writer.close();
|
||||
reader.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
public void testRandomCountWithBkd() throws IOException {
|
||||
String filedName = "field";
|
||||
Directory dir = newDirectory();
|
||||
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
|
||||
Sort indexSort = new Sort(new SortedNumericSortField(filedName, SortField.Type.LONG, false));
|
||||
iwc.setIndexSort(indexSort);
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
|
||||
Random random = random();
|
||||
for (int i = 0; i < 100; i++) {
|
||||
addDocWithBkd(writer, filedName, random.nextInt(1000), random.nextInt(1000));
|
||||
}
|
||||
writer.flush();
|
||||
writer.forceMerge(1);
|
||||
IndexReader reader = writer.getReader();
|
||||
IndexSearcher searcher = newSearcher(reader);
|
||||
|
||||
for (int i = 0; i < 100; i++) {
|
||||
int random1 = random.nextInt(1100);
|
||||
int random2 = random.nextInt(1100);
|
||||
int low = Math.min(random1, random2);
|
||||
int upper = Math.max(random1, random2);
|
||||
Query rangeQuery = LongPoint.newRangeQuery(filedName, low, upper);
|
||||
Query indexSortRangeQuery =
|
||||
new IndexSortSortedNumericDocValuesRangeQuery(filedName, low, upper, rangeQuery);
|
||||
Weight indexSortRangeQueryWeight =
|
||||
indexSortRangeQuery.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
|
||||
Weight rangeQueryWeight = rangeQuery.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
|
||||
for (LeafReaderContext context : searcher.getLeafContexts()) {
|
||||
assertEquals(rangeQueryWeight.count(context), indexSortRangeQueryWeight.count(context));
|
||||
}
|
||||
}
|
||||
|
||||
writer.close();
|
||||
reader.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
private void addDocWithBkd(RandomIndexWriter indexWriter, String field, long value, int repeat)
|
||||
throws IOException {
|
||||
for (int i = 0; i < repeat; i++) {
|
||||
Document doc = new Document();
|
||||
doc.add(new SortedNumericDocValuesField(field, value));
|
||||
doc.add(new LongPoint(field, value));
|
||||
indexWriter.addDocument(doc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue