LUCENE-7264: Fewer conditionals in DocIdSetBuilder.

This commit is contained in:
Adrien Grand 2016-04-29 10:35:27 +02:00
parent 54b873c2f9
commit aa81ba8642
10 changed files with 86 additions and 54 deletions

View File

@ -78,8 +78,8 @@ Optimizations
* LUCENE-7237: LRUQueryCache now prefers returning an uncached Scorer than * LUCENE-7237: LRUQueryCache now prefers returning an uncached Scorer than
waiting on a lock. (Adrien Grand) waiting on a lock. (Adrien Grand)
* LUCENE-7261: Speed up LSBRadixSorter (which is used by TermsQuery, multi-term * LUCENE-7261, LUCENE-7264: Speed up DocIdSetBuilder (which is used by
queries and point queries). (Adrien Grand) TermsQuery, multi-term queries and point queries). (Adrien Grand)
Bug Fixes Bug Fixes

View File

@ -44,6 +44,7 @@ class SimpleTextBKDReader extends BKDReader {
in.seek(blockFP); in.seek(blockFP);
readLine(in, scratch); readLine(in, scratch);
int count = parseInt(scratch, BLOCK_COUNT); int count = parseInt(scratch, BLOCK_COUNT);
visitor.grow(count);
for(int i=0;i<count;i++) { for(int i=0;i<count;i++) {
readLine(in, scratch); readLine(in, scratch);
visitor.visit(parseInt(scratch, BLOCK_DOC_ID)); visitor.visit(parseInt(scratch, BLOCK_DOC_ID));
@ -65,6 +66,7 @@ class SimpleTextBKDReader extends BKDReader {
@Override @Override
protected void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException { protected void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
visitor.grow(count);
// NOTE: we don't do prefix coding, so we ignore commonPrefixLengths // NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
assert scratchPackedValue.length == packedBytesLength; assert scratchPackedValue.length == packedBytesLength;
BytesRefBuilder scratch = new BytesRefBuilder(); BytesRefBuilder scratch = new BytesRefBuilder();

View File

@ -163,6 +163,7 @@ public abstract class PointInSetQuery extends Query {
private BytesRef nextQueryPoint; private BytesRef nextQueryPoint;
private final BytesRef scratch = new BytesRef(); private final BytesRef scratch = new BytesRef();
private final PrefixCodedTerms sortedPackedPoints; private final PrefixCodedTerms sortedPackedPoints;
private DocIdSetBuilder.BulkAdder adder;
public MergePointVisitor(PrefixCodedTerms sortedPackedPoints, DocIdSetBuilder result) throws IOException { public MergePointVisitor(PrefixCodedTerms sortedPackedPoints, DocIdSetBuilder result) throws IOException {
this.result = result; this.result = result;
@ -174,12 +175,12 @@ public abstract class PointInSetQuery extends Query {
@Override @Override
public void grow(int count) { public void grow(int count) {
result.grow(count); adder = result.grow(count);
} }
@Override @Override
public void visit(int docID) { public void visit(int docID) {
result.add(docID); adder.add(docID);
} }
@Override @Override
@ -189,7 +190,7 @@ public abstract class PointInSetQuery extends Query {
int cmp = nextQueryPoint.compareTo(scratch); int cmp = nextQueryPoint.compareTo(scratch);
if (cmp == 0) { if (cmp == 0) {
// Query point equals index point, so collect and return // Query point equals index point, so collect and return
result.add(docID); adder.add(docID);
break; break;
} else if (cmp < 0) { } else if (cmp < 0) {
// Query point is before index point, so we move to next query point // Query point is before index point, so we move to next query point
@ -237,6 +238,7 @@ public abstract class PointInSetQuery extends Query {
private final DocIdSetBuilder result; private final DocIdSetBuilder result;
private final byte[] pointBytes; private final byte[] pointBytes;
private DocIdSetBuilder.BulkAdder adder;
public SinglePointVisitor(DocIdSetBuilder result) { public SinglePointVisitor(DocIdSetBuilder result) {
this.result = result; this.result = result;
@ -251,12 +253,12 @@ public abstract class PointInSetQuery extends Query {
@Override @Override
public void grow(int count) { public void grow(int count) {
result.grow(count); adder = result.grow(count);
} }
@Override @Override
public void visit(int docID) { public void visit(int docID) {
result.add(docID); adder.add(docID);
} }
@Override @Override
@ -264,7 +266,7 @@ public abstract class PointInSetQuery extends Query {
assert packedValue.length == pointBytes.length; assert packedValue.length == pointBytes.length;
if (Arrays.equals(packedValue, pointBytes)) { if (Arrays.equals(packedValue, pointBytes)) {
// The point for this doc matches the point we are querying on // The point for this doc matches the point we are querying on
result.add(docID); adder.add(docID);
} }
} }

View File

@ -111,14 +111,16 @@ public abstract class PointRangeQuery extends Query {
values.intersect(field, values.intersect(field,
new IntersectVisitor() { new IntersectVisitor() {
DocIdSetBuilder.BulkAdder adder;
@Override @Override
public void grow(int count) { public void grow(int count) {
result.grow(count); adder = result.grow(count);
} }
@Override @Override
public void visit(int docID) { public void visit(int docID) {
result.add(docID); adder.add(docID);
} }
@Override @Override
@ -136,7 +138,7 @@ public abstract class PointRangeQuery extends Query {
} }
// Doc is in-bounds // Doc is in-bounds
result.add(docID); adder.add(docID);
} }
@Override @Override

View File

@ -17,6 +17,7 @@
package org.apache.lucene.util; package org.apache.lucene.util;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.DocIdSetIterator;
@ -26,17 +27,50 @@ import org.apache.lucene.util.packed.PackedInts;
* A builder of {@link DocIdSet}s. At first it uses a sparse structure to gather * A builder of {@link DocIdSet}s. At first it uses a sparse structure to gather
* documents, and then upgrades to a non-sparse bit set once enough hits match. * documents, and then upgrades to a non-sparse bit set once enough hits match.
* *
* To add documents, you first need to call {@link #grow} in order to reserve
* space, and then call {@link BulkAdder#add(int)} on the returned
* {@link BulkAdder}.
*
* @lucene.internal * @lucene.internal
*/ */
public final class DocIdSetBuilder { public final class DocIdSetBuilder {
/** Utility class to efficiently add many docs in one go.
* @see DocIdSetBuilder#grow */
public static abstract class BulkAdder {
public abstract void add(int doc);
}
private static class FixedBitSetAdder extends BulkAdder {
final FixedBitSet bitSet;
FixedBitSetAdder(FixedBitSet bitSet) {
this.bitSet = bitSet;
}
@Override
public void add(int doc) {
bitSet.set(doc);
}
}
private class BufferAdder extends BulkAdder {
@Override
public void add(int doc) {
buffer[bufferSize++] = doc;
}
}
private final int maxDoc; private final int maxDoc;
private final int threshold; private final int threshold;
private int[] buffer; private int[] buffer;
private int bufferSize; private int bufferSize;
private BitSet bitSet; private FixedBitSet bitSet;
private BulkAdder adder = new BufferAdder();
/** /**
* Create a builder that can contain doc IDs between {@code 0} and {@code maxDoc}. * Create a builder that can contain doc IDs between {@code 0} and {@code maxDoc}.
@ -62,6 +96,7 @@ public final class DocIdSetBuilder {
} }
this.buffer = null; this.buffer = null;
this.bufferSize = 0; this.bufferSize = 0;
this.adder = new FixedBitSetAdder(bitSet);
} }
/** Grows the buffer to at least minSize, but never larger than threshold. */ /** Grows the buffer to at least minSize, but never larger than threshold. */
@ -69,9 +104,7 @@ public final class DocIdSetBuilder {
assert minSize < threshold; assert minSize < threshold;
if (buffer.length < minSize) { if (buffer.length < minSize) {
int nextSize = Math.min(threshold, ArrayUtil.oversize(minSize, Integer.BYTES)); int nextSize = Math.min(threshold, ArrayUtil.oversize(minSize, Integer.BYTES));
int[] newBuffer = new int[nextSize]; buffer = Arrays.copyOf(buffer, nextSize);
System.arraycopy(buffer, 0, newBuffer, 0, buffer.length);
buffer = newBuffer;
} }
} }
@ -86,7 +119,7 @@ public final class DocIdSetBuilder {
if (bitSet != null) { if (bitSet != null) {
bitSet.or(iter); bitSet.or(iter);
} else { } else {
while (true) { while (true) {
assert buffer.length <= threshold; assert buffer.length <= threshold;
final int end = buffer.length; final int end = buffer.length;
for (int i = bufferSize; i < end; ++i) { for (int i = bufferSize; i < end; ++i) {
@ -114,39 +147,19 @@ public final class DocIdSetBuilder {
} }
/** /**
* Reserve space so that this builder can hold {@code numDocs} MORE documents. * Reserve space and return a {@link BulkAdder} object that can be used to
* add up to {@code numDocs} documents.
*/ */
public void grow(int numDocs) { public BulkAdder grow(int numDocs) {
if (bitSet == null) { if (bitSet == null) {
final long newLength = bufferSize + numDocs; final long newLength = (long) bufferSize + numDocs;
if (newLength < threshold) { if (newLength < threshold) {
growBuffer((int) newLength); growBuffer((int) newLength);
} else { } else {
upgradeToBitSet(); upgradeToBitSet();
} }
} }
} return adder;
/**
* Add a document to this builder.
* NOTE: doc IDs do not need to be provided in order.
* NOTE: if you plan on adding several docs at once, look into using
* {@link #grow(int)} to reserve space.
*/
public void add(int doc) {
if (bitSet != null) {
bitSet.set(doc);
} else {
if (bufferSize + 1 > buffer.length) {
if (bufferSize + 1 >= threshold) {
upgradeToBitSet();
bitSet.set(doc);
return;
}
growBuffer(bufferSize+1);
}
buffer[bufferSize++] = doc;
}
} }
private static int dedup(int[] arr, int length) { private static int dedup(int[] arr, int length) {

View File

@ -40,11 +40,13 @@ public class TestReqExclBulkScorer extends LuceneTestCase {
DocIdSetBuilder exclBuilder = new DocIdSetBuilder(maxDoc); DocIdSetBuilder exclBuilder = new DocIdSetBuilder(maxDoc);
final int numIncludedDocs = TestUtil.nextInt(random(), 1, maxDoc); final int numIncludedDocs = TestUtil.nextInt(random(), 1, maxDoc);
final int numExcludedDocs = TestUtil.nextInt(random(), 1, maxDoc); final int numExcludedDocs = TestUtil.nextInt(random(), 1, maxDoc);
DocIdSetBuilder.BulkAdder reqAdder = reqBuilder.grow(numIncludedDocs);
for (int i = 0; i < numIncludedDocs; ++i) { for (int i = 0; i < numIncludedDocs; ++i) {
reqBuilder.add(random().nextInt(maxDoc)); reqAdder.add(random().nextInt(maxDoc));
} }
DocIdSetBuilder.BulkAdder exclAdder = exclBuilder.grow(numIncludedDocs);
for (int i = 0; i < numExcludedDocs; ++i) { for (int i = 0; i < numExcludedDocs; ++i) {
exclBuilder.add(random().nextInt(maxDoc)); exclAdder.add(random().nextInt(maxDoc));
} }
final DocIdSet req = reqBuilder.build(); final DocIdSet req = reqBuilder.build();

View File

@ -122,11 +122,14 @@ public class TestDocIdSetBuilder extends LuceneTestCase {
DocIdSetBuilder builder = new DocIdSetBuilder(maxDoc); DocIdSetBuilder builder = new DocIdSetBuilder(maxDoc);
for (j = 0; j < array.length; ) { for (j = 0; j < array.length; ) {
final int l = TestUtil.nextInt(random(), 1, array.length - j); final int l = TestUtil.nextInt(random(), 1, array.length - j);
if (rarely()) { DocIdSetBuilder.BulkAdder adder = null;
builder.grow(l); for (int k = 0, budget = 0; k < l; ++k) {
} if (budget == 0 || rarely()) {
for (int k = 0; k < l; ++k) { budget = TestUtil.nextInt(random(), 1, l - k + 5);
builder.add(array[j++]); adder = builder.grow(budget);
}
adder.add(array[j++]);
budget--;
} }
} }

View File

@ -110,9 +110,11 @@ final class GeoPointTermQueryConstantScoreWrapper <Q extends GeoPointMultiTermQu
if (termsEnum.boundaryTerm()) { if (termsEnum.boundaryTerm()) {
builder.add(docs); builder.add(docs);
} else { } else {
int docId; int numDocs = termsEnum.docFreq();
while ((docId = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { DocIdSetBuilder.BulkAdder adder = builder.grow(numDocs);
builder.add(docId); for (int i = 0; i < numDocs; ++i) {
int docId = docs.nextDoc();
adder.add(docId);
preApproved.set(docId); preApproved.set(docId);
} }
} }

View File

@ -31,6 +31,7 @@ class PointInShapeIntersectVisitor implements IntersectVisitor {
private final DocIdSetBuilder hits; private final DocIdSetBuilder hits;
private final GeoShape shape; private final GeoShape shape;
private final XYZBounds shapeBounds; private final XYZBounds shapeBounds;
private DocIdSetBuilder.BulkAdder adder;
public PointInShapeIntersectVisitor(DocIdSetBuilder hits, GeoShape shape, XYZBounds shapeBounds) { public PointInShapeIntersectVisitor(DocIdSetBuilder hits, GeoShape shape, XYZBounds shapeBounds) {
this.hits = hits; this.hits = hits;
@ -40,12 +41,12 @@ class PointInShapeIntersectVisitor implements IntersectVisitor {
@Override @Override
public void grow(int count) { public void grow(int count) {
hits.grow(count); adder = hits.grow(count);
} }
@Override @Override
public void visit(int docID) { public void visit(int docID) {
hits.add(docID); adder.add(docID);
} }
@Override @Override
@ -58,7 +59,7 @@ class PointInShapeIntersectVisitor implements IntersectVisitor {
y >= shapeBounds.getMinimumY() && y <= shapeBounds.getMaximumY() && y >= shapeBounds.getMinimumY() && y <= shapeBounds.getMaximumY() &&
z >= shapeBounds.getMinimumZ() && z <= shapeBounds.getMaximumZ()) { z >= shapeBounds.getMinimumZ() && z <= shapeBounds.getMaximumZ()) {
if (shape.isWithin(x, y, z)) { if (shape.isWithin(x, y, z)) {
hits.add(docID); adder.add(docID);
} }
} }
} }

View File

@ -77,6 +77,7 @@ public final class AssertingPointsFormat extends PointsFormat {
final byte[] lastMaxPackedValue; final byte[] lastMaxPackedValue;
private Relation lastCompareResult; private Relation lastCompareResult;
private int lastDocID = -1; private int lastDocID = -1;
private int docBudget;
public AssertingIntersectVisitor(int numDims, int bytesPerDim, IntersectVisitor in) { public AssertingIntersectVisitor(int numDims, int bytesPerDim, IntersectVisitor in) {
this.in = in; this.in = in;
@ -93,6 +94,8 @@ public final class AssertingPointsFormat extends PointsFormat {
@Override @Override
public void visit(int docID) throws IOException { public void visit(int docID) throws IOException {
assert --docBudget >= 0 : "called add() more times than the last call to grow() reserved";
// This method, not filtering each hit, should only be invoked when the cell is inside the query shape: // This method, not filtering each hit, should only be invoked when the cell is inside the query shape:
assert lastCompareResult == Relation.CELL_INSIDE_QUERY; assert lastCompareResult == Relation.CELL_INSIDE_QUERY;
in.visit(docID); in.visit(docID);
@ -100,6 +103,7 @@ public final class AssertingPointsFormat extends PointsFormat {
@Override @Override
public void visit(int docID, byte[] packedValue) throws IOException { public void visit(int docID, byte[] packedValue) throws IOException {
assert --docBudget >= 0 : "called add() more times than the last call to grow() reserved";
// This method, to filter each doc's value, should only be invoked when the cell crosses the query shape: // This method, to filter each doc's value, should only be invoked when the cell crosses the query shape:
assert lastCompareResult == PointValues.Relation.CELL_CROSSES_QUERY; assert lastCompareResult == PointValues.Relation.CELL_CROSSES_QUERY;
@ -130,6 +134,7 @@ public final class AssertingPointsFormat extends PointsFormat {
@Override @Override
public void grow(int count) { public void grow(int count) {
in.grow(count); in.grow(count);
docBudget = count;
} }
@Override @Override