LUCENE-7262: Leverage index statistics to make DocIdSetBuilder more efficient.

Adrien Grand 2016-05-02 15:04:27 +02:00
parent 32f7d045d6
commit e9f2ac0021
16 changed files with 273 additions and 19 deletions
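In short: DocIdSetBuilder now accepts Terms or PointValues, so it can derive the
average number of values per document from index statistics, skip deduplication
for single-valued fields, and pass a more accurate cost to BitDocIdSet. A minimal
caller-side sketch, assuming the Lucene 6.x LeafReader API this patch targets
(the helper class, its names and the null checks are illustrative, not part of
this commit):

    import java.io.IOException;

    import org.apache.lucene.index.LeafReader;
    import org.apache.lucene.index.PointValues;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.util.DocIdSetBuilder;

    class StatsAwareBuilders {
      // Terms-based: stats come from getDocCount() and getSumDocFreq().
      static DocIdSetBuilder forTerms(LeafReader reader, String field) throws IOException {
        Terms terms = reader.terms(field);
        return terms == null
            ? new DocIdSetBuilder(reader.maxDoc())          // no stats available
            : new DocIdSetBuilder(reader.maxDoc(), terms);  // stats-aware
      }

      // Points-based: stats come from getDocCount(field) and size(field).
      static DocIdSetBuilder forPoints(LeafReader reader, String field) throws IOException {
        PointValues values = reader.getPointValues();
        return values == null
            ? new DocIdSetBuilder(reader.maxDoc())
            : new DocIdSetBuilder(reader.maxDoc(), values, field);
      }
    }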

View File

@@ -71,8 +71,8 @@ Optimizations
 * LUCENE-7237: LRUQueryCache now prefers returning an uncached Scorer than
   waiting on a lock. (Adrien Grand)
 
-* LUCENE-7261, LUCENE-7264: Speed up DocIdSetBuilder (which is used by
-  TermsQuery, multi-term queries and point queries). (Adrien Grand)
+* LUCENE-7261, LUCENE-7262, LUCENE-7264: Speed up DocIdSetBuilder (which is used
+  by TermsQuery, multi-term queries and point queries). (Adrien Grand)
 
 Bug Fixes

View File

@@ -166,7 +166,7 @@ final class MultiTermQueryConstantScoreWrapper<Q extends MultiTermQuery> extends
       }
       // Too many terms: go back to the terms we already collected and start building the bit set
-      DocIdSetBuilder builder = new DocIdSetBuilder(context.reader().maxDoc());
+      DocIdSetBuilder builder = new DocIdSetBuilder(context.reader().maxDoc(), terms);
       if (collectedTerms.isEmpty() == false) {
         TermsEnum termsEnum2 = terms.iterator();
         for (TermAndState t : collectedTerms) {

View File

@@ -130,7 +130,7 @@ public abstract class PointInSetQuery extends Query {
           throw new IllegalArgumentException("field=\"" + field + "\" was indexed with bytesPerDim=" + fieldInfo.getPointNumBytes() + " but this query has bytesPerDim=" + bytesPerDim);
         }
-        DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc());
+        DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field);
         if (numDims == 1) {

View File

@@ -106,7 +106,7 @@ public abstract class PointRangeQuery extends Query {
     return new ConstantScoreWeight(this) {
       private DocIdSet buildMatchingDocIdSet(LeafReader reader, PointValues values) throws IOException {
-        DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc());
+        DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field);
         values.intersect(field,
             new IntersectVisitor() {

View File

@@ -36,6 +36,9 @@ public class BitDocIdSet extends DocIdSet {
    * {@link BitSet} must not be modified afterwards.
    */
   public BitDocIdSet(BitSet set, long cost) {
+    if (cost < 0) {
+      throw new IllegalArgumentException("cost must be >= 0, got " + cost);
+    }
     this.set = set;
     this.cost = cost;
   }

View File

@@ -54,6 +54,9 @@ public class BitSetIterator extends DocIdSetIterator {
   /** Sole constructor. */
   public BitSetIterator(BitSet bits, long cost) {
+    if (cost < 0) {
+      throw new IllegalArgumentException("cost must be >= 0, got " + cost);
+    }
     this.bits = bits;
     this.length = bits.length();
     this.cost = cost;

View File

@@ -19,6 +19,8 @@ package org.apache.lucene.util;
 import java.io.IOException;
 import java.util.Arrays;
+import org.apache.lucene.index.PointValues;
+import org.apache.lucene.index.Terms;
 import org.apache.lucene.search.DocIdSet;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.util.packed.PackedInts;
@@ -65,18 +67,48 @@ public final class DocIdSetBuilder {
   private final int maxDoc;
   private final int threshold;
+  // pkg-private for testing
+  final boolean multivalued;
+  final double numValuesPerDoc;
+
   private int[] buffer;
   private int bufferSize;
   private FixedBitSet bitSet;
+  private long counter = -1;
+
   private BulkAdder adder = new BufferAdder();
 
   /**
    * Create a builder that can contain doc IDs between {@code 0} and {@code maxDoc}.
    */
   public DocIdSetBuilder(int maxDoc) {
+    this(maxDoc, -1, -1);
+  }
+
+  /** Create a {@link DocIdSetBuilder} instance that is optimized for
+   *  accumulating docs that match the given {@link Terms}. */
+  public DocIdSetBuilder(int maxDoc, Terms terms) throws IOException {
+    this(maxDoc, terms.getDocCount(), terms.getSumDocFreq());
+  }
+
+  /** Create a {@link DocIdSetBuilder} instance that is optimized for
+   *  accumulating docs that match the given {@link PointValues}. */
+  public DocIdSetBuilder(int maxDoc, PointValues values, String field) throws IOException {
+    this(maxDoc, values.getDocCount(field), values.size(field));
+  }
+
+  DocIdSetBuilder(int maxDoc, int docCount, long valueCount) {
     this.maxDoc = maxDoc;
+    this.multivalued = docCount < 0 || docCount != valueCount;
+    this.numValuesPerDoc = (docCount < 0 || valueCount < 0)
+        // assume one value per doc, this means the cost will be overestimated
+        // if the docs are actually multi-valued
+        ? 1
+        // otherwise compute from index stats
+        : (double) valueCount / docCount;
+    assert numValuesPerDoc >= 1;
     // For ridiculously small sets, we'll just use a sorted int[]
     // maxDoc >>> 7 is a good value if you want to save memory, lower values
     // such as maxDoc >>> 11 should provide faster building but at the expense
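For reference, a few concrete stats and what the shared constructor above derives
from them (the pairs mirror the new testLeverageStats below):

    // docCount, valueCount -> multivalued, numValuesPerDoc
    //       42,         42 -> false,       1.0  (single-valued: dedup can be skipped)
    //       42,         63 -> true,        1.5  (1.5 values per doc on average)
    //       42,         -1 -> true,        1.0  (incomplete stats: conservative)
    //       -1,         84 -> true,        1.0  (incomplete stats: conservative)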
@@ -94,6 +126,7 @@ public final class DocIdSetBuilder {
     for (int i = 0; i < bufferSize; ++i) {
       bitSet.set(buffer[i]);
     }
+    counter = this.bufferSize;
     this.buffer = null;
     this.bufferSize = 0;
     this.adder = new FixedBitSetAdder(bitSet);
@@ -157,7 +190,10 @@ public final class DocIdSetBuilder {
         growBuffer((int) newLength);
       } else {
         upgradeToBitSet();
+        counter += numDocs;
       }
+    } else {
+      counter += numDocs;
     }
     return adder;
   }
@@ -179,17 +215,32 @@ public final class DocIdSetBuilder {
     return l;
   }
 
+  private static boolean noDups(int[] a, int len) {
+    for (int i = 1; i < len; ++i) {
+      assert a[i-1] < a[i];
+    }
+    return true;
+  }
+
   /**
    * Build a {@link DocIdSet} from the accumulated doc IDs.
    */
   public DocIdSet build() {
     try {
       if (bitSet != null) {
-        return new BitDocIdSet(bitSet);
+        assert counter >= 0;
+        final long cost = Math.round(counter / numValuesPerDoc);
+        return new BitDocIdSet(bitSet, cost);
       } else {
         LSBRadixSorter sorter = new LSBRadixSorter();
         sorter.sort(PackedInts.bitsRequired(maxDoc - 1), buffer, bufferSize);
-        final int l = dedup(buffer, bufferSize);
+        final int l;
+        if (multivalued) {
+          l = dedup(buffer, bufferSize);
+        } else {
+          assert noDups(buffer, bufferSize);
+          l = bufferSize;
+        }
+        assert l <= bufferSize;
         buffer = ArrayUtil.grow(buffer, l + 1);
         buffer[l] = DocIdSetIterator.NO_MORE_DOCS;
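A worked example of the bit-set branch of build(), using the multi-valued numbers
from testLeverageStats below:

    // numValuesPerDoc = 63 / 42.0           = 1.5
    // counter         = 2                     (two values collected)
    // cost            = Math.round(2 / 1.5) = 1
    // i.e. the builder guesses that the two collected values hit the same doc,
    // so the resulting BitDocIdSet advertises a cost of 1, not 2.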

View File

@@ -19,6 +19,9 @@ package org.apache.lucene.util;
 import java.io.IOException;
+import org.apache.lucene.index.PointValues;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.DocIdSet;
 import org.apache.lucene.search.DocIdSetIterator;
@@ -158,4 +161,180 @@ public class TestDocIdSetBuilder extends LuceneTestCase {
     assertEquals(new BitDocIdSet(expected), builder.build());
   }
 
+  public void testLeverageStats() throws IOException {
+    // single-valued points
+    PointValues values = new DummyPointValues(42, 42);
+    DocIdSetBuilder builder = new DocIdSetBuilder(100, values, "foo");
+    assertEquals(1d, builder.numValuesPerDoc, 0d);
+    assertFalse(builder.multivalued);
+    DocIdSetBuilder.BulkAdder adder = builder.grow(2);
+    adder.add(5);
+    adder.add(7);
+    DocIdSet set = builder.build();
+    assertTrue(set instanceof BitDocIdSet);
+    assertEquals(2, set.iterator().cost());
+
+    // multi-valued points
+    values = new DummyPointValues(42, 63);
+    builder = new DocIdSetBuilder(100, values, "foo");
+    assertEquals(1.5, builder.numValuesPerDoc, 0d);
+    assertTrue(builder.multivalued);
+    adder = builder.grow(2);
+    adder.add(5);
+    adder.add(7);
+    set = builder.build();
+    assertTrue(set instanceof BitDocIdSet);
+    assertEquals(1, set.iterator().cost()); // it thinks the same doc was added twice
+
+    // incomplete stats
+    values = new DummyPointValues(42, -1);
+    builder = new DocIdSetBuilder(100, values, "foo");
+    assertEquals(1d, builder.numValuesPerDoc, 0d);
+    assertTrue(builder.multivalued);
+
+    values = new DummyPointValues(-1, 84);
+    builder = new DocIdSetBuilder(100, values, "foo");
+    assertEquals(1d, builder.numValuesPerDoc, 0d);
+    assertTrue(builder.multivalued);
+
+    // single-valued terms
+    Terms terms = new DummyTerms(42, 42);
+    builder = new DocIdSetBuilder(100, terms);
+    assertEquals(1d, builder.numValuesPerDoc, 0d);
+    assertFalse(builder.multivalued);
+    adder = builder.grow(2);
+    adder.add(5);
+    adder.add(7);
+    set = builder.build();
+    assertTrue(set instanceof BitDocIdSet);
+    assertEquals(2, set.iterator().cost());
+
+    // multi-valued terms
+    terms = new DummyTerms(42, 63);
+    builder = new DocIdSetBuilder(100, terms);
+    assertEquals(1.5, builder.numValuesPerDoc, 0d);
+    assertTrue(builder.multivalued);
+    adder = builder.grow(2);
+    adder.add(5);
+    adder.add(7);
+    set = builder.build();
+    assertTrue(set instanceof BitDocIdSet);
+    assertEquals(1, set.iterator().cost()); // it thinks the same doc was added twice
+
+    // incomplete stats
+    terms = new DummyTerms(42, -1);
+    builder = new DocIdSetBuilder(100, terms);
+    assertEquals(1d, builder.numValuesPerDoc, 0d);
+    assertTrue(builder.multivalued);
+
+    terms = new DummyTerms(-1, 84);
+    builder = new DocIdSetBuilder(100, terms);
+    assertEquals(1d, builder.numValuesPerDoc, 0d);
+    assertTrue(builder.multivalued);
+  }
+
+  private static class DummyTerms extends Terms {
+
+    private final int docCount;
+    private final long numValues;
+
+    DummyTerms(int docCount, long numValues) {
+      this.docCount = docCount;
+      this.numValues = numValues;
+    }
+
+    @Override
+    public TermsEnum iterator() throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public long size() throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public long getSumTotalTermFreq() throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public long getSumDocFreq() throws IOException {
+      return numValues;
+    }
+
+    @Override
+    public int getDocCount() throws IOException {
+      return docCount;
+    }
+
+    @Override
+    public boolean hasFreqs() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean hasOffsets() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean hasPositions() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean hasPayloads() {
+      throw new UnsupportedOperationException();
+    }
+  }
+
+  private static class DummyPointValues extends PointValues {
+
+    private final int docCount;
+    private final long numPoints;
+
+    DummyPointValues(int docCount, long numPoints) {
+      this.docCount = docCount;
+      this.numPoints = numPoints;
+    }
+
+    @Override
+    public void intersect(String fieldName, IntersectVisitor visitor) throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public byte[] getMinPackedValue(String fieldName) throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public byte[] getMaxPackedValue(String fieldName) throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public int getNumDimensions(String fieldName) throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public int getBytesPerDimension(String fieldName) throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public long size(String fieldName) {
+      return numPoints;
+    }
+
+    @Override
+    public int getDocCount(String fieldName) {
+      return docCount;
+    }
+  }
 }

View File

@@ -21,6 +21,7 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Objects;
 import java.util.Set;
@@ -84,6 +85,7 @@ public class TermsQuery extends Query implements Accountable {
   // Same threshold as MultiTermQueryConstantScoreWrapper
   static final int BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD = 16;
 
+  private final Set<String> fields;
   private final PrefixCodedTerms termData;
   private final int termDataHashCode; // cached hashcode of termData
@@ -99,13 +101,16 @@
       ArrayUtil.timSort(sortedTerms);
     }
     PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
+    Set<String> fields = new HashSet<>();
     Term previous = null;
     for (Term term : sortedTerms) {
       if (term.equals(previous) == false) {
+        fields.add(term.field());
         builder.add(term);
       }
       previous = term;
     }
+    this.fields = Collections.unmodifiableSet(fields);
     termData = builder.finish();
     termDataHashCode = termData.hashCode();
   }
@@ -132,6 +137,7 @@
       builder.add(field, term);
       previous.copyBytes(term);
     }
+    fields = Collections.singleton(field);
     termData = builder.finish();
     termDataHashCode = termData.hashCode();
   }
@@ -301,7 +307,15 @@
             matchingTerms.add(new TermAndState(field, termsEnum));
           } else {
             assert matchingTerms.size() == threshold;
-            builder = new DocIdSetBuilder(reader.maxDoc());
+            if (TermsQuery.this.fields.size() == 1) {
+              // common case: all terms are in the same field
+              // use an optimized builder that leverages terms stats to be more efficient
+              builder = new DocIdSetBuilder(reader.maxDoc(), terms);
+            } else {
+              // corner case: different fields
+              // don't make assumptions about the docs we will get
+              builder = new DocIdSetBuilder(reader.maxDoc());
+            }
             docs = termsEnum.postings(docs, PostingsEnum.NONE);
             builder.add(docs);
             for (TermAndState t : matchingTerms) {
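A usage sketch of the branch above (field and term values are illustrative, and
the stats-aware path is only reached once the per-segment term count exceeds the
rewrite threshold):

    import org.apache.lucene.index.Term;
    import org.apache.lucene.queries.TermsQuery;

    // All terms in one field: fields.size() == 1, so the rewrite can hand the
    // field's Terms to DocIdSetBuilder and benefit from its statistics.
    TermsQuery sameField = new TermsQuery(
        new Term("color", "red"), new Term("color", "blue"));

    // Terms spread over two fields: fields.size() == 2, so the builder is
    // created without stats, making no assumptions about matching docs.
    TermsQuery mixedFields = new TermsQuery(
        new Term("color", "red"), new Term("shape", "round"));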

View File

@@ -163,8 +163,8 @@ public class IntersectsRPTVerifyQuery extends Query {
     // TODO consider if IntersectsPrefixTreeQuery should simply do this and provide both sets
     class IntersectsDifferentiatingVisitor extends VisitorTemplate {
-      DocIdSetBuilder approxBuilder = new DocIdSetBuilder(maxDoc);
-      DocIdSetBuilder exactBuilder = new DocIdSetBuilder(maxDoc);
+      DocIdSetBuilder approxBuilder = new DocIdSetBuilder(maxDoc, terms);
+      DocIdSetBuilder exactBuilder = new DocIdSetBuilder(maxDoc, terms);
       boolean approxIsEmpty = true;
       boolean exactIsEmpty = true;
       DocIdSet exactDocIdSet;

View File

@@ -105,16 +105,20 @@ public abstract class AbstractPrefixTreeQuery extends Query {
     protected final LeafReaderContext context;
     protected final int maxDoc;
 
-    protected TermsEnum termsEnum;//remember to check for null!
+    protected final Terms terms;
+    protected final TermsEnum termsEnum;//remember to check for null!
     protected PostingsEnum postingsEnum;
 
     public BaseTermsEnumTraverser(LeafReaderContext context) throws IOException {
       this.context = context;
       LeafReader reader = context.reader();
       this.maxDoc = reader.maxDoc();
-      Terms terms = reader.terms(fieldName);
-      if (terms != null)
+      terms = reader.terms(fieldName);
+      if (terms != null) {
         this.termsEnum = terms.iterator();
+      } else {
+        this.termsEnum = null;
+      }
     }
protected void collectDocs(BitSet bitSet) throws IOException {

View File

@@ -55,8 +55,8 @@ public class IntersectsPrefixTreeQuery extends AbstractVisitingPrefixTreeQuery {
     private DocIdSetBuilder results;
 
     @Override
-    protected void start() {
-      results = new DocIdSetBuilder(maxDoc);
+    protected void start() throws IOException {
+      results = new DocIdSetBuilder(maxDoc, terms);
     }
 
     @Override
@Override

View File

@@ -93,7 +93,7 @@ final class GeoPointTermQueryConstantScoreWrapper <Q extends GeoPointMultiTermQu
       LeafReader reader = context.reader();
       // approximation (postfiltering has not yet been applied)
-      DocIdSetBuilder builder = new DocIdSetBuilder(reader.maxDoc());
+      DocIdSetBuilder builder = new DocIdSetBuilder(reader.maxDoc(), terms);
       // subset of documents that need no postfiltering, this is purely an optimization
       final BitSet preApproved;
       // dumb heuristic: if the field is really sparse, use a sparse impl

View File

@@ -97,7 +97,7 @@ final class PointInGeo3DShapeQuery extends Query {
         assert xyzSolid.getRelationship(shape) == GeoArea.WITHIN || xyzSolid.getRelationship(shape) == GeoArea.OVERLAPS: "expected WITHIN (1) or OVERLAPS (2) but got " + xyzSolid.getRelationship(shape) + "; shape="+shape+"; XYZSolid="+xyzSolid;
         */
-        DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc());
+        DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field);
         values.intersect(field, new PointInShapeIntersectVisitor(result, shape, shapeBounds));

View File

@@ -433,7 +433,7 @@ public final class SolrRangeQuery extends ExtendedQueryBase implements DocSetPro
         termsVisited += builder.add(termsEnum, base, liveDocs);
       */
-      DocIdSetBuilder builder = new DocIdSetBuilder(context.reader().maxDoc());
+      DocIdSetBuilder builder = new DocIdSetBuilder(context.reader().maxDoc(), terms);
       builder.grow((int)Math.min(Integer.MAX_VALUE,count));
       if (collectedTerms.isEmpty() == false) {
         TermsEnum termsEnum2 = terms.iterator();

View File

@@ -226,7 +226,7 @@ public class GraphTermsQParserPlugin extends QParserPlugin {
       Terms terms = fields.terms(field);
       TermsEnum termsEnum = terms.iterator();
      PostingsEnum docs = null;
-      DocIdSetBuilder builder = new DocIdSetBuilder(reader.maxDoc());
+      DocIdSetBuilder builder = new DocIdSetBuilder(reader.maxDoc(), terms);
       for (int i=0; i<finalContexts.size(); i++) {
         TermContext termContext = finalContexts.get(i);
         TermState termState = termContext.get(context.ord);