mirror of https://github.com/apache/lucene.git
LUCENE-10425: Speed up IndexSortSortedNumericDocValuesRangeQuery#BoundedDocIdSetIterator construction using BKD binary search (#687)
This commit is contained in:
parent
bcc116057d
commit
5b24a233bd
|
@ -101,6 +101,9 @@ Improvements
|
||||||
* GITHUB#11785: Improve Tessellator performance by delaying calls to the method
|
* GITHUB#11785: Improve Tessellator performance by delaying calls to the method
|
||||||
#isIntersectingPolygon (Ignacio Vera)
|
#isIntersectingPolygon (Ignacio Vera)
|
||||||
|
|
||||||
|
* GITHUB#687: Speed up IndexSortSortedNumericDocValuesRangeQuery#BoundedDocIdSetIterator
|
||||||
|
construction using BKD binary search. (Jianping Weng)
|
||||||
|
|
||||||
Bug Fixes
|
Bug Fixes
|
||||||
---------------------
|
---------------------
|
||||||
* GITHUB#11726: Indexing term vectors on large documents could fail due to
|
* GITHUB#11726: Indexing term vectors on large documents could fail due to
|
||||||
|
|
|
@ -18,12 +18,17 @@ package org.apache.lucene.sandbox.search;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
|
import java.util.function.Predicate;
|
||||||
|
import org.apache.lucene.document.IntPoint;
|
||||||
|
import org.apache.lucene.document.LongPoint;
|
||||||
import org.apache.lucene.index.DocValues;
|
import org.apache.lucene.index.DocValues;
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.LeafReader;
|
import org.apache.lucene.index.LeafReader;
|
||||||
import org.apache.lucene.index.LeafReaderContext;
|
import org.apache.lucene.index.LeafReaderContext;
|
||||||
import org.apache.lucene.index.NumericDocValues;
|
import org.apache.lucene.index.NumericDocValues;
|
||||||
import org.apache.lucene.index.PointValues;
|
import org.apache.lucene.index.PointValues;
|
||||||
|
import org.apache.lucene.index.PointValues.IntersectVisitor;
|
||||||
|
import org.apache.lucene.index.PointValues.Relation;
|
||||||
import org.apache.lucene.index.SortedNumericDocValues;
|
import org.apache.lucene.index.SortedNumericDocValues;
|
||||||
import org.apache.lucene.search.ConstantScoreScorer;
|
import org.apache.lucene.search.ConstantScoreScorer;
|
||||||
import org.apache.lucene.search.ConstantScoreWeight;
|
import org.apache.lucene.search.ConstantScoreWeight;
|
||||||
|
@ -43,6 +48,8 @@ import org.apache.lucene.search.SortField;
|
||||||
import org.apache.lucene.search.SortField.Type;
|
import org.apache.lucene.search.SortField.Type;
|
||||||
import org.apache.lucene.search.SortedNumericSortField;
|
import org.apache.lucene.search.SortedNumericSortField;
|
||||||
import org.apache.lucene.search.Weight;
|
import org.apache.lucene.search.Weight;
|
||||||
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
|
import org.apache.lucene.util.ArrayUtil.ByteArrayComparator;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A range query that can take advantage of the fact that the index is sorted to speed up execution.
|
* A range query that can take advantage of the fact that the index is sorted to speed up execution.
|
||||||
|
@ -214,12 +221,172 @@ public class IndexSortSortedNumericDocValuesRangeQuery extends Query {
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the first document whose packed value is greater than or equal (if allowEqual is true)
|
||||||
|
* to the provided packed value or -1 if all packed values are smaller than the provided one,
|
||||||
|
*/
|
||||||
|
public final int nextDoc(PointValues values, byte[] packedValue, boolean allowEqual)
|
||||||
|
throws IOException {
|
||||||
|
assert values.getNumDimensions() == 1;
|
||||||
|
final int bytesPerDim = values.getBytesPerDimension();
|
||||||
|
final ByteArrayComparator comparator = ArrayUtil.getUnsignedComparator(bytesPerDim);
|
||||||
|
final Predicate<byte[]> biggerThan =
|
||||||
|
testPackedValue -> {
|
||||||
|
int cmp = comparator.compare(testPackedValue, 0, packedValue, 0);
|
||||||
|
return cmp > 0 || (cmp == 0 && allowEqual);
|
||||||
|
};
|
||||||
|
return nextDoc(values.getPointTree(), biggerThan);
|
||||||
|
}
|
||||||
|
|
||||||
|
private int nextDoc(PointValues.PointTree pointTree, Predicate<byte[]> biggerThan)
|
||||||
|
throws IOException {
|
||||||
|
if (biggerThan.test(pointTree.getMaxPackedValue()) == false) {
|
||||||
|
// doc is before us
|
||||||
|
return -1;
|
||||||
|
} else if (pointTree.moveToChild()) {
|
||||||
|
// navigate down
|
||||||
|
do {
|
||||||
|
final int doc = nextDoc(pointTree, biggerThan);
|
||||||
|
if (doc != -1) {
|
||||||
|
return doc;
|
||||||
|
}
|
||||||
|
} while (pointTree.moveToSibling());
|
||||||
|
pointTree.moveToParent();
|
||||||
|
return -1;
|
||||||
|
} else {
|
||||||
|
// doc is in this leaf
|
||||||
|
final int[] doc = {-1};
|
||||||
|
pointTree.visitDocValues(
|
||||||
|
new IntersectVisitor() {
|
||||||
|
@Override
|
||||||
|
public void visit(int docID) {
|
||||||
|
throw new AssertionError("Invalid call to visit(docID)");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void visit(int docID, byte[] packedValue) {
|
||||||
|
if (doc[0] == -1 && biggerThan.test(packedValue)) {
|
||||||
|
doc[0] = docID;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
|
||||||
|
return Relation.CELL_CROSSES_QUERY;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
return doc[0];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean matchNone(PointValues points, byte[] queryLowerPoint, byte[] queryUpperPoint)
|
||||||
|
throws IOException {
|
||||||
|
final ByteArrayComparator comparator =
|
||||||
|
ArrayUtil.getUnsignedComparator(points.getBytesPerDimension());
|
||||||
|
for (int dim = 0; dim < points.getNumDimensions(); dim++) {
|
||||||
|
int offset = dim * points.getBytesPerDimension();
|
||||||
|
if (comparator.compare(points.getMinPackedValue(), offset, queryUpperPoint, offset) > 0
|
||||||
|
|| comparator.compare(points.getMaxPackedValue(), offset, queryLowerPoint, offset) < 0) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean matchAll(PointValues points, byte[] queryLowerPoint, byte[] queryUpperPoint)
|
||||||
|
throws IOException {
|
||||||
|
final ByteArrayComparator comparator =
|
||||||
|
ArrayUtil.getUnsignedComparator(points.getBytesPerDimension());
|
||||||
|
for (int dim = 0; dim < points.getNumDimensions(); dim++) {
|
||||||
|
int offset = dim * points.getBytesPerDimension();
|
||||||
|
if (comparator.compare(points.getMinPackedValue(), offset, queryUpperPoint, offset) > 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (comparator.compare(points.getMaxPackedValue(), offset, queryLowerPoint, offset) < 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (comparator.compare(points.getMinPackedValue(), offset, queryLowerPoint, offset) < 0
|
||||||
|
|| comparator.compare(points.getMaxPackedValue(), offset, queryUpperPoint, offset) > 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
private BoundedDocIdSetIterator getDocIdSetIteratorOrNullFromBkd(
|
||||||
|
LeafReaderContext context, DocIdSetIterator delegate) throws IOException {
|
||||||
|
Sort indexSort = context.reader().getMetaData().getSort();
|
||||||
|
if (indexSort != null
|
||||||
|
&& indexSort.getSort().length > 0
|
||||||
|
&& indexSort.getSort()[0].getField().equals(field)
|
||||||
|
&& indexSort.getSort()[0].getReverse() == false) {
|
||||||
|
PointValues points = context.reader().getPointValues(field);
|
||||||
|
if (points == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (points.getNumDimensions() != 1) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (points.getBytesPerDimension() != Long.BYTES
|
||||||
|
&& points.getBytesPerDimension() != Integer.BYTES) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Each doc that has points has exactly one point.
|
||||||
|
if (points.size() == points.getDocCount()) {
|
||||||
|
|
||||||
|
byte[] queryLowerPoint;
|
||||||
|
byte[] queryUpperPoint;
|
||||||
|
if (points.getBytesPerDimension() == Integer.BYTES) {
|
||||||
|
queryLowerPoint = IntPoint.pack((int) lowerValue).bytes;
|
||||||
|
queryUpperPoint = IntPoint.pack((int) upperValue).bytes;
|
||||||
|
} else {
|
||||||
|
queryLowerPoint = LongPoint.pack(lowerValue).bytes;
|
||||||
|
queryUpperPoint = LongPoint.pack(upperValue).bytes;
|
||||||
|
}
|
||||||
|
if (lowerValue > upperValue || matchNone(points, queryLowerPoint, queryUpperPoint)) {
|
||||||
|
return new BoundedDocIdSetIterator(0, 0, null);
|
||||||
|
}
|
||||||
|
int minDocId, maxDocId;
|
||||||
|
if (matchAll(points, queryLowerPoint, queryUpperPoint)) {
|
||||||
|
minDocId = 0;
|
||||||
|
maxDocId = context.reader().maxDoc();
|
||||||
|
} else {
|
||||||
|
// >=queryLowerPoint
|
||||||
|
minDocId = nextDoc(points, queryLowerPoint, true);
|
||||||
|
|
||||||
|
if (minDocId == -1) {
|
||||||
|
return new BoundedDocIdSetIterator(0, 0, null);
|
||||||
|
}
|
||||||
|
// >queryUpperPoint,
|
||||||
|
maxDocId = nextDoc(points, queryUpperPoint, false);
|
||||||
|
if (maxDocId == -1) {
|
||||||
|
maxDocId = context.reader().maxDoc();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((points.getDocCount() == context.reader().maxDoc())) {
|
||||||
|
return new BoundedDocIdSetIterator(minDocId, maxDocId, null);
|
||||||
|
} else {
|
||||||
|
return new BoundedDocIdSetIterator(minDocId, maxDocId, delegate);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
private BoundedDocIdSetIterator getDocIdSetIteratorOrNull(LeafReaderContext context)
|
private BoundedDocIdSetIterator getDocIdSetIteratorOrNull(LeafReaderContext context)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
SortedNumericDocValues sortedNumericValues =
|
SortedNumericDocValues sortedNumericValues =
|
||||||
DocValues.getSortedNumeric(context.reader(), field);
|
DocValues.getSortedNumeric(context.reader(), field);
|
||||||
NumericDocValues numericValues = DocValues.unwrapSingleton(sortedNumericValues);
|
NumericDocValues numericValues = DocValues.unwrapSingleton(sortedNumericValues);
|
||||||
if (numericValues != null) {
|
if (numericValues != null) {
|
||||||
|
BoundedDocIdSetIterator iterator = getDocIdSetIteratorOrNullFromBkd(context, numericValues);
|
||||||
|
if (iterator != null) {
|
||||||
|
return iterator;
|
||||||
|
}
|
||||||
Sort indexSort = context.reader().getMetaData().getSort();
|
Sort indexSort = context.reader().getMetaData().getSort();
|
||||||
if (indexSort != null
|
if (indexSort != null
|
||||||
&& indexSort.getSort().length > 0
|
&& indexSort.getSort().length > 0
|
||||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.sandbox.search;
|
||||||
import static org.hamcrest.CoreMatchers.instanceOf;
|
import static org.hamcrest.CoreMatchers.instanceOf;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.Random;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
import org.apache.lucene.document.LongPoint;
|
import org.apache.lucene.document.LongPoint;
|
||||||
|
@ -641,4 +642,115 @@ public class TestIndexSortSortedNumericDocValuesRangeQuery extends LuceneTestCas
|
||||||
return new IndexSortSortedNumericDocValuesRangeQuery(
|
return new IndexSortSortedNumericDocValuesRangeQuery(
|
||||||
field, lowerValue, upperValue, fallbackQuery);
|
field, lowerValue, upperValue, fallbackQuery);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testCountWithBkd() throws IOException {
|
||||||
|
String filedName = "field";
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
|
||||||
|
Sort indexSort = new Sort(new SortedNumericSortField(filedName, SortField.Type.LONG, false));
|
||||||
|
iwc.setIndexSort(indexSort);
|
||||||
|
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
|
||||||
|
addDocWithBkd(writer, filedName, 6, 500);
|
||||||
|
addDocWithBkd(writer, filedName, 5, 500);
|
||||||
|
addDocWithBkd(writer, filedName, 8, 500);
|
||||||
|
addDocWithBkd(writer, filedName, 9, 500);
|
||||||
|
addDocWithBkd(writer, filedName, 7, 500);
|
||||||
|
writer.flush();
|
||||||
|
writer.forceMerge(1);
|
||||||
|
IndexReader reader = writer.getReader();
|
||||||
|
IndexSearcher searcher = newSearcher(reader);
|
||||||
|
|
||||||
|
Query fallbackQuery = LongPoint.newRangeQuery(filedName, 6, 8);
|
||||||
|
Query query = new IndexSortSortedNumericDocValuesRangeQuery(filedName, 6, 8, fallbackQuery);
|
||||||
|
Weight weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
|
||||||
|
for (LeafReaderContext context : searcher.getLeafContexts()) {
|
||||||
|
assertEquals(1500, weight.count(context));
|
||||||
|
}
|
||||||
|
|
||||||
|
fallbackQuery = LongPoint.newRangeQuery(filedName, 6, 10);
|
||||||
|
query = new IndexSortSortedNumericDocValuesRangeQuery(filedName, 6, 10, fallbackQuery);
|
||||||
|
weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
|
||||||
|
for (LeafReaderContext context : searcher.getLeafContexts()) {
|
||||||
|
assertEquals(2000, weight.count(context));
|
||||||
|
}
|
||||||
|
|
||||||
|
fallbackQuery = LongPoint.newRangeQuery(filedName, 4, 6);
|
||||||
|
query = new IndexSortSortedNumericDocValuesRangeQuery(filedName, 4, 6, fallbackQuery);
|
||||||
|
weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
|
||||||
|
for (LeafReaderContext context : searcher.getLeafContexts()) {
|
||||||
|
assertEquals(1000, weight.count(context));
|
||||||
|
}
|
||||||
|
|
||||||
|
fallbackQuery = LongPoint.newRangeQuery(filedName, 2, 10);
|
||||||
|
query = new IndexSortSortedNumericDocValuesRangeQuery(filedName, 2, 10, fallbackQuery);
|
||||||
|
weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
|
||||||
|
for (LeafReaderContext context : searcher.getLeafContexts()) {
|
||||||
|
assertEquals(2500, weight.count(context));
|
||||||
|
}
|
||||||
|
|
||||||
|
fallbackQuery = LongPoint.newRangeQuery(filedName, 2, 3);
|
||||||
|
query = new IndexSortSortedNumericDocValuesRangeQuery(filedName, 2, 3, fallbackQuery);
|
||||||
|
weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
|
||||||
|
for (LeafReaderContext context : searcher.getLeafContexts()) {
|
||||||
|
assertEquals(0, weight.count(context));
|
||||||
|
}
|
||||||
|
|
||||||
|
fallbackQuery = LongPoint.newRangeQuery(filedName, 10, 11);
|
||||||
|
query = new IndexSortSortedNumericDocValuesRangeQuery(filedName, 10, 11, fallbackQuery);
|
||||||
|
weight = query.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
|
||||||
|
for (LeafReaderContext context : searcher.getLeafContexts()) {
|
||||||
|
assertEquals(0, weight.count(context));
|
||||||
|
}
|
||||||
|
|
||||||
|
writer.close();
|
||||||
|
reader.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRandomCountWithBkd() throws IOException {
|
||||||
|
String filedName = "field";
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
|
||||||
|
Sort indexSort = new Sort(new SortedNumericSortField(filedName, SortField.Type.LONG, false));
|
||||||
|
iwc.setIndexSort(indexSort);
|
||||||
|
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
|
||||||
|
Random random = random();
|
||||||
|
for (int i = 0; i < 100; i++) {
|
||||||
|
addDocWithBkd(writer, filedName, random.nextInt(1000), random.nextInt(1000));
|
||||||
|
}
|
||||||
|
writer.flush();
|
||||||
|
writer.forceMerge(1);
|
||||||
|
IndexReader reader = writer.getReader();
|
||||||
|
IndexSearcher searcher = newSearcher(reader);
|
||||||
|
|
||||||
|
for (int i = 0; i < 100; i++) {
|
||||||
|
int random1 = random.nextInt(1100);
|
||||||
|
int random2 = random.nextInt(1100);
|
||||||
|
int low = Math.min(random1, random2);
|
||||||
|
int upper = Math.max(random1, random2);
|
||||||
|
Query rangeQuery = LongPoint.newRangeQuery(filedName, low, upper);
|
||||||
|
Query indexSortRangeQuery =
|
||||||
|
new IndexSortSortedNumericDocValuesRangeQuery(filedName, low, upper, rangeQuery);
|
||||||
|
Weight indexSortRangeQueryWeight =
|
||||||
|
indexSortRangeQuery.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
|
||||||
|
Weight rangeQueryWeight = rangeQuery.createWeight(searcher, ScoreMode.COMPLETE, 1.0f);
|
||||||
|
for (LeafReaderContext context : searcher.getLeafContexts()) {
|
||||||
|
assertEquals(rangeQueryWeight.count(context), indexSortRangeQueryWeight.count(context));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
writer.close();
|
||||||
|
reader.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void addDocWithBkd(RandomIndexWriter indexWriter, String field, long value, int repeat)
|
||||||
|
throws IOException {
|
||||||
|
for (int i = 0; i < repeat; i++) {
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(new SortedNumericDocValuesField(field, value));
|
||||||
|
doc.add(new LongPoint(field, value));
|
||||||
|
indexWriter.addDocument(doc);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue