LUCENE-7828: Speed up range queries on range fields by improving how we compute the relation between the query and inner nodes of the BKD tree.

This commit is contained in:
Adrien Grand 2017-06-07 17:49:33 +02:00
parent b25dda0b20
commit 528899d845
2 changed files with 232 additions and 151 deletions

View File

@ -134,6 +134,12 @@ Improvements
* LUCENE-7841: Normalize ґ to г in Ukrainian analyzer. (Andriy Rysin via Dawid Weiss)
Optimizations
* LUCENE-7828: Speed up range queries on range fields by improving how we
compute the relation between the query and inner nodes of the BKD tree.
(Adrien Grand)
======================= Lucene 6.6.0 =======================
New Features

View File

@ -19,22 +19,20 @@ package org.apache.lucene.document;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays; import java.util.Arrays;
import java.util.Objects; import java.util.Objects;
import java.util.function.IntPredicate;
import java.util.function.Predicate;
import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PointValues; import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.index.PointValues.IntersectVisitor; import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.search.ConstantScoreScorer; import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight; import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query; import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer; import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.ScorerSupplier;
import org.apache.lucene.search.Weight; import org.apache.lucene.search.Weight;
import org.apache.lucene.util.DocIdSetBuilder; import org.apache.lucene.util.DocIdSetBuilder;
import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.StringHelper;
@ -60,13 +58,167 @@ abstract class RangeFieldQuery extends Query {
/** Used by {@code RangeFieldQuery} to check how each internal or leaf node relates to the query. */ /** Used by {@code RangeFieldQuery} to check how each internal or leaf node relates to the query. */
enum QueryType { enum QueryType {
/** Use this for intersects queries. */ /** Use this for intersects queries. */
INTERSECTS, INTERSECTS {
/**
 * Relates the query to a BKD cell along one dimension, using INTERSECTS semantics.
 *
 * <p>Each packed array is read at two offsets: {@code dim * bytesPerDim} (the lower bound
 * of dimension {@code dim}) and that offset plus {@code numDims * bytesPerDim} (the upper
 * bound of the same dimension).
 *
 * @return {@code CELL_OUTSIDE_QUERY} when the query's upper bound sorts before the lower
 *         bound stored in {@code minPackedValue}, or the query's lower bound sorts after
 *         the upper bound stored in {@code maxPackedValue}; {@code CELL_INSIDE_QUERY} when
 *         the query's upper bound is at or after the lower bound stored in
 *         {@code maxPackedValue} and the query's lower bound is at or before the upper
 *         bound stored in {@code minPackedValue}; {@code CELL_CROSSES_QUERY} otherwise
 */
@Override
Relation compare(byte[] queryPackedValue, byte[] minPackedValue, byte[] maxPackedValue,
    int numDims, int bytesPerDim, int dim) {
  final int lowerOffset = dim * bytesPerDim;
  final int upperOffset = lowerOffset + bytesPerDim * numDims;

  // disjoint: the query ends before the smallest lower bound in the cell ...
  if (StringHelper.compare(bytesPerDim, queryPackedValue, upperOffset, minPackedValue, lowerOffset) < 0) {
    return Relation.CELL_OUTSIDE_QUERY;
  }
  // ... or starts after the largest upper bound in the cell
  if (StringHelper.compare(bytesPerDim, queryPackedValue, lowerOffset, maxPackedValue, upperOffset) > 0) {
    return Relation.CELL_OUTSIDE_QUERY;
  }

  final boolean touchesEveryRange =
      StringHelper.compare(bytesPerDim, queryPackedValue, upperOffset, maxPackedValue, lowerOffset) >= 0
          && StringHelper.compare(bytesPerDim, queryPackedValue, lowerOffset, minPackedValue, upperOffset) <= 0;
  return touchesEveryRange ? Relation.CELL_INSIDE_QUERY : Relation.CELL_CROSSES_QUERY;
}
/**
 * Tests whether an indexed range intersects the query along dimension {@code dim}.
 *
 * @return {@code true} when the query's upper bound is at or after the value's lower bound
 *         and the query's lower bound is at or before the value's upper bound
 */
@Override
boolean matches(byte[] queryPackedValue, byte[] packedValue, int numDims, int bytesPerDim, int dim) {
  final int lowerOffset = dim * bytesPerDim;
  final int upperOffset = lowerOffset + bytesPerDim * numDims;
  if (StringHelper.compare(bytesPerDim, queryPackedValue, upperOffset, packedValue, lowerOffset) < 0) {
    // the query ends before the indexed range starts
    return false;
  }
  return StringHelper.compare(bytesPerDim, queryPackedValue, lowerOffset, packedValue, upperOffset) <= 0;
}
},
/** Use this for within queries. */ /** Use this for within queries. */
WITHIN, WITHIN {
/**
 * Relates the query to a BKD cell along one dimension, using WITHIN semantics.
 *
 * <p>Each packed array is read at two offsets: {@code dim * bytesPerDim} (the lower bound
 * of dimension {@code dim}) and that offset plus {@code numDims * bytesPerDim} (the upper
 * bound of the same dimension).
 *
 * @return {@code CELL_OUTSIDE_QUERY} when the query's upper bound sorts before the upper
 *         bound stored in {@code minPackedValue}, or the query's lower bound sorts after
 *         the lower bound stored in {@code maxPackedValue}; {@code CELL_INSIDE_QUERY} when
 *         the query's upper bound is at or after the upper bound stored in
 *         {@code maxPackedValue} and the query's lower bound is at or before the lower
 *         bound stored in {@code minPackedValue}; {@code CELL_CROSSES_QUERY} otherwise
 */
@Override
Relation compare(byte[] queryPackedValue, byte[] minPackedValue, byte[] maxPackedValue,
    int numDims, int bytesPerDim, int dim) {
  final int lowerOffset = dim * bytesPerDim;
  final int upperOffset = lowerOffset + bytesPerDim * numDims;

  // all ranges have at least one point outside of the query
  if (StringHelper.compare(bytesPerDim, queryPackedValue, upperOffset, minPackedValue, upperOffset) < 0) {
    return Relation.CELL_OUTSIDE_QUERY;
  }
  if (StringHelper.compare(bytesPerDim, queryPackedValue, lowerOffset, maxPackedValue, lowerOffset) > 0) {
    return Relation.CELL_OUTSIDE_QUERY;
  }

  final boolean enclosesEveryRange =
      StringHelper.compare(bytesPerDim, queryPackedValue, upperOffset, maxPackedValue, upperOffset) >= 0
          && StringHelper.compare(bytesPerDim, queryPackedValue, lowerOffset, minPackedValue, lowerOffset) <= 0;
  return enclosesEveryRange ? Relation.CELL_INSIDE_QUERY : Relation.CELL_CROSSES_QUERY;
}
/**
 * Tests whether an indexed range lies within the query along dimension {@code dim}.
 *
 * @return {@code true} when the query's lower bound is at or before the value's lower bound
 *         and the query's upper bound is at or after the value's upper bound
 */
@Override
boolean matches(byte[] queryPackedValue, byte[] packedValue, int numDims, int bytesPerDim, int dim) {
  final int lowerOffset = dim * bytesPerDim;
  final int upperOffset = lowerOffset + bytesPerDim * numDims;
  if (StringHelper.compare(bytesPerDim, queryPackedValue, lowerOffset, packedValue, lowerOffset) > 0) {
    // the indexed range starts before the query does
    return false;
  }
  return StringHelper.compare(bytesPerDim, queryPackedValue, upperOffset, packedValue, upperOffset) >= 0;
}
},
/** Use this for contains */ /** Use this for contains */
CONTAINS, CONTAINS {
/**
 * Relates the query to a BKD cell along one dimension, using CONTAINS semantics.
 *
 * <p>Each packed array is read at two offsets: {@code dim * bytesPerDim} (the lower bound
 * of dimension {@code dim}) and that offset plus {@code numDims * bytesPerDim} (the upper
 * bound of the same dimension).
 *
 * @return {@code CELL_OUTSIDE_QUERY} when the query's upper bound sorts after the upper
 *         bound stored in {@code maxPackedValue}, or the query's lower bound sorts before
 *         the lower bound stored in {@code minPackedValue}; {@code CELL_INSIDE_QUERY} when
 *         the query's upper bound is at or before the upper bound stored in
 *         {@code minPackedValue} and the query's lower bound is at or after the lower
 *         bound stored in {@code maxPackedValue}; {@code CELL_CROSSES_QUERY} otherwise
 */
@Override
Relation compare(byte[] queryPackedValue, byte[] minPackedValue, byte[] maxPackedValue,
    int numDims, int bytesPerDim, int dim) {
  final int lowerOffset = dim * bytesPerDim;
  final int upperOffset = lowerOffset + bytesPerDim * numDims;

  // every range in the cell ends before the query does ...
  if (StringHelper.compare(bytesPerDim, queryPackedValue, upperOffset, maxPackedValue, upperOffset) > 0) {
    return Relation.CELL_OUTSIDE_QUERY;
  }
  // ... or every range in the cell starts after the query does
  if (StringHelper.compare(bytesPerDim, queryPackedValue, lowerOffset, minPackedValue, lowerOffset) < 0) {
    return Relation.CELL_OUTSIDE_QUERY;
  }

  final boolean enclosedByEveryRange =
      StringHelper.compare(bytesPerDim, queryPackedValue, upperOffset, minPackedValue, upperOffset) <= 0
          && StringHelper.compare(bytesPerDim, queryPackedValue, lowerOffset, maxPackedValue, lowerOffset) >= 0;
  return enclosedByEveryRange ? Relation.CELL_INSIDE_QUERY : Relation.CELL_CROSSES_QUERY;
}
/**
 * Tests whether an indexed range contains the query along dimension {@code dim}.
 *
 * @return {@code true} when the query's lower bound is at or after the value's lower bound
 *         and the query's upper bound is at or before the value's upper bound
 */
@Override
boolean matches(byte[] queryPackedValue, byte[] packedValue, int numDims, int bytesPerDim, int dim) {
  final int lowerOffset = dim * bytesPerDim;
  final int upperOffset = lowerOffset + bytesPerDim * numDims;
  if (StringHelper.compare(bytesPerDim, queryPackedValue, lowerOffset, packedValue, lowerOffset) < 0) {
    // the query starts before the indexed range does
    return false;
  }
  return StringHelper.compare(bytesPerDim, queryPackedValue, upperOffset, packedValue, upperOffset) <= 0;
}
},
/** Use this for crosses queries */ /** Use this for crosses queries */
CROSSES CROSSES {
/**
 * Per-dimension comparison is not implemented for CROSSES: this constant overrides the
 * all-dimensions {@code compare} instead and combines the INTERSECTS and WITHIN results.
 *
 * @throws UnsupportedOperationException always
 */
@Override
Relation compare(byte[] queryPackedValue, byte[] minPackedValue, byte[] maxPackedValue,
int numDims, int bytesPerDim, int dim) {
throw new UnsupportedOperationException();
}
/**
 * Per-dimension matching is not implemented for CROSSES: this constant defines the
 * all-dimensions {@code matches} instead and combines the INTERSECTS and WITHIN results.
 *
 * @throws UnsupportedOperationException always
 */
@Override
boolean matches(byte[] queryPackedValue, byte[] packedValue, int numDims, int bytesPerDim, int dim) {
throw new UnsupportedOperationException();
}
/**
 * Relates the query to a BKD cell across all dimensions, using CROSSES semantics
 * (intersects the query but is not within it), derived from the INTERSECTS and WITHIN
 * relations of the same cell.
 *
 * @return {@code CELL_OUTSIDE_QUERY} when INTERSECTS reports the cell outside the query or
 *         WITHIN reports it inside; {@code CELL_INSIDE_QUERY} when INTERSECTS reports the
 *         cell inside and WITHIN reports it outside; {@code CELL_CROSSES_QUERY} otherwise
 */
@Override
Relation compare(byte[] queryPackedValue, byte[] minPackedValue, byte[] maxPackedValue,
    int numDims, int bytesPerDim) {
  final Relation intersects =
      QueryType.INTERSECTS.compare(queryPackedValue, minPackedValue, maxPackedValue, numDims, bytesPerDim);
  if (intersects == Relation.CELL_OUTSIDE_QUERY) {
    // nothing in the cell even intersects the query
    return Relation.CELL_OUTSIDE_QUERY;
  }

  final Relation within =
      QueryType.WITHIN.compare(queryPackedValue, minPackedValue, maxPackedValue, numDims, bytesPerDim);
  if (within == Relation.CELL_INSIDE_QUERY) {
    // everything in the cell is within the query, so nothing crosses it
    return Relation.CELL_OUTSIDE_QUERY;
  }

  final boolean everyRangeCrosses =
      intersects == Relation.CELL_INSIDE_QUERY && within == Relation.CELL_OUTSIDE_QUERY;
  return everyRangeCrosses ? Relation.CELL_INSIDE_QUERY : Relation.CELL_CROSSES_QUERY;
}
/**
 * Tests whether an indexed range crosses the query: it must match under INTERSECTS and
 * must not match under WITHIN.
 *
 * <p>Overrides the all-dimensions {@code matches} of the enum, so the per-dimension
 * variant (which throws for this constant) is not called from here.
 *
 * @return {@code true} when {@code packedValue} intersects the query without being within it
 */
@Override
boolean matches(byte[] queryPackedValue, byte[] packedValue, int numDims, int bytesPerDim) {
  return INTERSECTS.matches(queryPackedValue, packedValue, numDims, bytesPerDim)
      && WITHIN.matches(queryPackedValue, packedValue, numDims, bytesPerDim) == false;
}
};
/**
 * Relates the query to a BKD cell along the single dimension {@code dim}.
 * {@code minPackedValue} and {@code maxPackedValue} are the cell bounds handed to the
 * intersect visitor; the all-dimensions {@code compare} calls this once per dimension.
 */
abstract Relation compare(byte[] queryPackedValue, byte[] minPackedValue, byte[] maxPackedValue, int numDims, int bytesPerDim, int dim);
/**
 * Relates the query to a BKD cell across all dimensions by combining the per-dimension
 * relations.
 *
 * @return {@code CELL_OUTSIDE_QUERY} as soon as any dimension reports it;
 *         {@code CELL_INSIDE_QUERY} when every dimension reports it (including when
 *         {@code numDims} is zero); {@code CELL_CROSSES_QUERY} otherwise
 */
Relation compare(byte[] queryPackedValue, byte[] minPackedValue, byte[] maxPackedValue, int numDims, int bytesPerDim) {
  Relation combined = Relation.CELL_INSIDE_QUERY;
  for (int d = 0; d < numDims; d++) {
    final Relation perDim = compare(queryPackedValue, minPackedValue, maxPackedValue, numDims, bytesPerDim, d);
    if (perDim == Relation.CELL_OUTSIDE_QUERY) {
      // one disjoint dimension is enough to rule the whole cell out
      return Relation.CELL_OUTSIDE_QUERY;
    }
    if (perDim != Relation.CELL_INSIDE_QUERY) {
      combined = Relation.CELL_CROSSES_QUERY;
    }
  }
  return combined;
}
/**
 * Tests whether an indexed value satisfies this query type along the single dimension
 * {@code dim}; the all-dimensions {@code matches} calls this once per dimension.
 */
abstract boolean matches(byte[] queryPackedValue, byte[] packedValue, int numDims, int bytesPerDim, int dim);
/**
 * Tests whether an indexed value satisfies this query type in every dimension.
 * Stops at the first dimension that does not match.
 *
 * @return {@code true} when all {@code numDims} dimensions match (trivially so when
 *         {@code numDims} is not positive), {@code false} otherwise
 */
boolean matches(byte[] queryPackedValue, byte[] packedValue, int numDims, int bytesPerDim) {
  int d = 0;
  // advance while the current dimension matches
  while (d < numDims && matches(queryPackedValue, packedValue, numDims, bytesPerDim, d)) {
    d++;
  }
  return d >= numDims;
}
} }
/** /**
@ -111,54 +263,33 @@ abstract class RangeFieldQuery extends Query {
@Override @Override
public final Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException { public final Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
return new ConstantScoreWeight(this, boost) { return new ConstantScoreWeight(this, boost) {
final RangeFieldComparator target = new RangeFieldComparator();
private DocIdSet buildMatchingDocIdSet(LeafReader reader, PointValues values) throws IOException { private IntersectVisitor getIntersectVisitor(DocIdSetBuilder result) {
DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field); return new IntersectVisitor() {
values.intersect( DocIdSetBuilder.BulkAdder adder;
new IntersectVisitor() { @Override
DocIdSetBuilder.BulkAdder adder; public void grow(int count) {
@Override adder = result.grow(count);
public void grow(int count) { }
adder = result.grow(count); @Override
} public void visit(int docID) throws IOException {
@Override adder.add(docID);
public void visit(int docID) throws IOException { }
adder.add(docID); @Override
} public void visit(int docID, byte[] leaf) throws IOException {
@Override if (queryType.matches(ranges, leaf, numDims, bytesPerDim)) {
public void visit(int docID, byte[] leaf) throws IOException { adder.add(docID);
if (target.matches(leaf)) { }
adder.add(docID); }
} @Override
} public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
@Override return queryType.compare(ranges, minPackedValue, maxPackedValue, numDims, bytesPerDim);
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { }
return compareRange(minPackedValue, maxPackedValue); };
}
});
return result.build();
}
private Relation compareRange(byte[] minPackedValue, byte[] maxPackedValue) {
byte[] node = getInternalRange(minPackedValue, maxPackedValue);
// compute range relation for BKD traversal
if (target.intersects(node) == false) {
return Relation.CELL_OUTSIDE_QUERY;
} else if (target.within(node)) {
// target within cell; continue traversing:
return Relation.CELL_CROSSES_QUERY;
} else if (target.contains(node)) {
// target contains cell; add iff queryType is not a CONTAINS or CROSSES query:
return (queryType == QueryType.CONTAINS || queryType == QueryType.CROSSES) ?
Relation.CELL_OUTSIDE_QUERY : Relation.CELL_INSIDE_QUERY;
}
// target intersects cell; continue traversing:
return Relation.CELL_CROSSES_QUERY;
} }
@Override @Override
public Scorer scorer(LeafReaderContext context) throws IOException { public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
LeafReader reader = context.reader(); LeafReader reader = context.reader();
PointValues values = reader.getPointValues(field); PointValues values = reader.getPointValues(field);
if (values == null) { if (values == null) {
@ -173,115 +304,59 @@ abstract class RangeFieldQuery extends Query {
checkFieldInfo(fieldInfo); checkFieldInfo(fieldInfo);
boolean allDocsMatch = false; boolean allDocsMatch = false;
if (values.getDocCount() == reader.maxDoc() if (values.getDocCount() == reader.maxDoc()
&& compareRange(values.getMinPackedValue(), values.getMaxPackedValue()) == Relation.CELL_INSIDE_QUERY) { && queryType.compare(ranges, values.getMinPackedValue(), values.getMaxPackedValue(), numDims, bytesPerDim) == Relation.CELL_INSIDE_QUERY) {
allDocsMatch = true; allDocsMatch = true;
} }
DocIdSetIterator iterator = allDocsMatch == true ? final Weight weight = this;
DocIdSetIterator.all(reader.maxDoc()) : buildMatchingDocIdSet(reader, values).iterator(); if (allDocsMatch) {
return new ConstantScoreScorer(this, score(), iterator); return new ScorerSupplier() {
} @Override
public Scorer get(boolean randomAccess) {
return new ConstantScoreScorer(weight, score(), DocIdSetIterator.all(reader.maxDoc()));
}
/** get an encoded byte representation of the internal node; this is @Override
* the lower half of the min array and the upper half of the max array */ public long cost() {
private byte[] getInternalRange(byte[] min, byte[] max) { return reader.maxDoc();
byte[] range = new byte[min.length]; }
final int dimSize = numDims * bytesPerDim; };
System.arraycopy(min, 0, range, 0, dimSize); } else {
System.arraycopy(max, dimSize, range, dimSize, dimSize); return new ScorerSupplier() {
return range;
}
};
}
/** final DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field);
* RangeFieldComparator class provides the core comparison logic for accepting or rejecting indexed final IntersectVisitor visitor = getIntersectVisitor(result);
* {@code RangeField} types based on the defined query range and relation. long cost = -1;
*/
class RangeFieldComparator {
final Predicate<byte[]> predicate;
/** constructs the comparator based on the query type */ @Override
RangeFieldComparator() { public Scorer get(boolean randomAccess) throws IOException {
switch (queryType) { values.intersect(visitor);
case INTERSECTS: DocIdSetIterator iterator = result.build().iterator();
predicate = this::intersects; return new ConstantScoreScorer(weight, score(), iterator);
break; }
case WITHIN:
predicate = this::contains;
break;
case CONTAINS:
predicate = this::within;
break;
case CROSSES:
// crosses first checks intersection (disjoint automatic fails),
// then ensures the query doesn't wholly contain the leaf:
predicate = (byte[] leaf) -> this.intersects(leaf)
&& this.contains(leaf) == false;
break;
default:
throw new IllegalArgumentException("invalid queryType [" + queryType + "] found.");
}
}
/** determines if the candidate range matches the query request */ @Override
private boolean matches(final byte[] candidate) { public long cost() {
return (Arrays.equals(ranges, candidate) && queryType != QueryType.CROSSES) if (cost == -1) {
|| predicate.test(candidate); // Computing the cost may be expensive, so only do it if necessary
} cost = values.estimatePointCount(visitor);
assert cost >= 0;
/** check if query intersects candidate range */ }
private boolean intersects(final byte[] candidate) { return cost;
return relate((int d) -> compareMinMax(candidate, d) > 0 || compareMaxMin(candidate, d) < 0); }
} };
/** check if query is within candidate range */
private boolean within(final byte[] candidate) {
return relate((int d) -> compareMinMin(candidate, d) < 0 || compareMaxMax(candidate, d) > 0);
}
/** check if query contains candidate range */
private boolean contains(final byte[] candidate) {
return relate((int d) -> compareMinMin(candidate, d) > 0 || compareMaxMax(candidate, d) < 0);
}
/** internal method used by each relation method to test range relation logic */
private boolean relate(IntPredicate predicate) {
for (int d=0; d<numDims; ++d) {
if (predicate.test(d)) {
return false;
} }
} }
return true;
}
/** compare the encoded min value (for the defined query dimension) with the encoded min value in the byte array */ @Override
private int compareMinMin(byte[] b, int dimension) { public Scorer scorer(LeafReaderContext context) throws IOException {
// convert dimension to offset: ScorerSupplier scorerSupplier = scorerSupplier(context);
dimension *= bytesPerDim; if (scorerSupplier == null) {
return StringHelper.compare(bytesPerDim, ranges, dimension, b, dimension); return null;
} }
return scorerSupplier.get(false);
/** compare the encoded min value (for the defined query dimension) with the encoded max value in the byte array */ }
private int compareMinMax(byte[] b, int dimension) { };
// convert dimension to offset:
dimension *= bytesPerDim;
return StringHelper.compare(bytesPerDim, ranges, dimension, b, numDims * bytesPerDim + dimension);
}
/** compare the encoded max value (for the defined query dimension) with the encoded min value in the byte array */
private int compareMaxMin(byte[] b, int dimension) {
// convert dimension to offset:
dimension *= bytesPerDim;
return StringHelper.compare(bytesPerDim, ranges, numDims * bytesPerDim + dimension, b, dimension);
}
/** compare the encoded max value (for the defined query dimension) with the encoded max value in the byte array */
private int compareMaxMax(byte[] b, int dimension) {
// convert dimension to max offset:
dimension = numDims * bytesPerDim + dimension * bytesPerDim;
return StringHelper.compare(bytesPerDim, ranges, dimension, b, dimension);
}
} }
@Override @Override