mirror of https://github.com/apache/lucene.git
LUCENE-7262: Leverage index statistics to make DocIdSetBuilder more efficient.
parent 32f7d045d6
commit e9f2ac0021

@@ -71,8 +71,8 @@ Optimizations
 * LUCENE-7237: LRUQueryCache now prefers returning an uncached Scorer than
   waiting on a lock. (Adrien Grand)
 
-* LUCENE-7261, LUCENE-7264: Speed up DocIdSetBuilder (which is used by
-  TermsQuery, multi-term queries and point queries). (Adrien Grand)
+* LUCENE-7261, LUCENE-7262, LUCENE-7264: Speed up DocIdSetBuilder (which is used
+  by TermsQuery, multi-term queries and point queries). (Adrien Grand)
 
 Bug Fixes
 
@@ -166,7 +166,7 @@ final class MultiTermQueryConstantScoreWrapper<Q extends MultiTermQuery> extends
       }
 
       // Too many terms: go back to the terms we already collected and start building the bit set
-      DocIdSetBuilder builder = new DocIdSetBuilder(context.reader().maxDoc());
+      DocIdSetBuilder builder = new DocIdSetBuilder(context.reader().maxDoc(), terms);
       if (collectedTerms.isEmpty() == false) {
         TermsEnum termsEnum2 = terms.iterator();

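A minimal usage sketch (not part of the patch) of the Terms overload the hunk above switches to; the reader and field name are illustrative assumptions:

    import java.io.IOException;

    import org.apache.lucene.index.LeafReader;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.util.DocIdSetBuilder;

    class TermsStatsBuilderDemo {
      static DocIdSetBuilder create(LeafReader reader, String field) throws IOException {
        Terms terms = reader.terms(field);
        if (terms == null) {
          // nothing indexed for this field in this segment: no stats to leverage
          return new DocIdSetBuilder(reader.maxDoc());
        }
        // terms.getDocCount() and terms.getSumDocFreq() let the builder
        // estimate how many values each matching doc contributes
        return new DocIdSetBuilder(reader.maxDoc(), terms);
      }
    }
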
@@ -130,7 +130,7 @@ public abstract class PointInSetQuery extends Query {
           throw new IllegalArgumentException("field=\"" + field + "\" was indexed with bytesPerDim=" + fieldInfo.getPointNumBytes() + " but this query has bytesPerDim=" + bytesPerDim);
         }
 
-        DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc());
+        DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field);
 
         if (numDims == 1) {

@@ -106,7 +106,7 @@ public abstract class PointRangeQuery extends Query {
     return new ConstantScoreWeight(this) {
 
       private DocIdSet buildMatchingDocIdSet(LeafReader reader, PointValues values) throws IOException {
-        DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc());
+        DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field);
 
         values.intersect(field,
             new IntersectVisitor() {

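The two point-query hunks make the same substitution. A hedged sketch (not from the patch) of the PointValues overload, assuming the Lucene 6.x API where a reader exposes one PointValues and the field name is passed to each method:

    import java.io.IOException;

    import org.apache.lucene.index.LeafReader;
    import org.apache.lucene.index.PointValues;
    import org.apache.lucene.util.DocIdSetBuilder;

    class PointStatsBuilderDemo {
      static DocIdSetBuilder create(LeafReader reader, String field) throws IOException {
        PointValues values = reader.getPointValues(); // may be null if no points were indexed
        if (values == null) {
          return new DocIdSetBuilder(reader.maxDoc());
        }
        // values.getDocCount(field) and values.size(field) feed the same stats-based estimate
        return new DocIdSetBuilder(reader.maxDoc(), values, field);
      }
    }
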
@@ -36,6 +36,9 @@ public class BitDocIdSet extends DocIdSet {
    * {@link BitSet} must not be modified afterwards.
    */
   public BitDocIdSet(BitSet set, long cost) {
+    if (cost < 0) {
+      throw new IllegalArgumentException("cost must be >= 0, got " + cost);
+    }
     this.set = set;
     this.cost = cost;
   }

@@ -54,6 +54,9 @@ public class BitSetIterator extends DocIdSetIterator {
 
   /** Sole constructor. */
   public BitSetIterator(BitSet bits, long cost) {
+    if (cost < 0) {
+      throw new IllegalArgumentException("cost must be >= 0, got " + cost);
+    }
     this.bits = bits;
     this.length = bits.length();
     this.cost = cost;

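Both constructors above now validate their cost argument. A small demo (not part of the patch) of the tightened contract; cardinality() is one way to supply an exact cost:

    import org.apache.lucene.util.BitDocIdSet;
    import org.apache.lucene.util.FixedBitSet;

    class CostValidationDemo {
      public static void main(String[] args) {
        FixedBitSet bits = new FixedBitSet(100);
        bits.set(5);
        bits.set(7);
        BitDocIdSet ok = new BitDocIdSet(bits, bits.cardinality());
        System.out.println(ok.iterator().cost()); // 2
        try {
          new BitDocIdSet(bits, -1); // rejected since this change
        } catch (IllegalArgumentException expected) {
          System.out.println(expected.getMessage()); // cost must be >= 0, got -1
        }
      }
    }
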
@@ -19,6 +19,8 @@ package org.apache.lucene.util;
 import java.io.IOException;
 import java.util.Arrays;
 
+import org.apache.lucene.index.PointValues;
+import org.apache.lucene.index.Terms;
 import org.apache.lucene.search.DocIdSet;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.util.packed.PackedInts;

@@ -65,18 +67,48 @@ public final class DocIdSetBuilder {
 
   private final int maxDoc;
   private final int threshold;
+  // pkg-private for testing
+  final boolean multivalued;
+  final double numValuesPerDoc;
 
   private int[] buffer;
   private int bufferSize;
 
   private FixedBitSet bitSet;
 
+  private long counter = -1;
   private BulkAdder adder = new BufferAdder();
 
   /**
    * Create a builder that can contain doc IDs between {@code 0} and {@code maxDoc}.
    */
   public DocIdSetBuilder(int maxDoc) {
+    this(maxDoc, -1, -1);
+  }
+
+  /** Create a {@link DocIdSetBuilder} instance that is optimized for
+   *  accumulating docs that match the given {@link Terms}. */
+  public DocIdSetBuilder(int maxDoc, Terms terms) throws IOException {
+    this(maxDoc, terms.getDocCount(), terms.getSumDocFreq());
+  }
+
+  /** Create a {@link DocIdSetBuilder} instance that is optimized for
+   *  accumulating docs that match the given {@link PointValues}. */
+  public DocIdSetBuilder(int maxDoc, PointValues values, String field) throws IOException {
+    this(maxDoc, values.getDocCount(field), values.size(field));
+  }
+
+  DocIdSetBuilder(int maxDoc, int docCount, long valueCount) {
     this.maxDoc = maxDoc;
+    this.multivalued = docCount < 0 || docCount != valueCount;
+    this.numValuesPerDoc = (docCount < 0 || valueCount < 0)
+        // assume one value per doc, this means the cost will be overestimated
+        // if the docs are actually multi-valued
+        ? 1
+        // otherwise compute from index stats
+        : (double) valueCount / docCount;
+    assert numValuesPerDoc >= 1;
+
     // For ridiculously small sets, we'll just use a sorted int[]
     // maxDoc >>> 7 is a good value if you want to save memory, lower values
     // such as maxDoc >>> 11 should provide faster building but at the expense

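A worked example of the stats arithmetic in the package-private constructor above, with made-up stats of 42 docs carrying 63 values:

    class StatsArithmeticDemo {
      public static void main(String[] args) {
        int docCount = 42;    // docs that have the field (getDocCount)
        long valueCount = 63; // total values (sumDocFreq for terms, size for points)
        boolean multivalued = docCount < 0 || docCount != valueCount;
        double numValuesPerDoc = (docCount < 0 || valueCount < 0)
            ? 1 // unknown stats: assume single-valued, which overestimates cost
            : (double) valueCount / docCount;
        System.out.println(multivalued);     // true
        System.out.println(numValuesPerDoc); // 1.5
      }
    }
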
@@ -94,6 +126,7 @@ public final class DocIdSetBuilder {
     for (int i = 0; i < bufferSize; ++i) {
       bitSet.set(buffer[i]);
     }
+    counter = this.bufferSize;
     this.buffer = null;
     this.bufferSize = 0;
     this.adder = new FixedBitSetAdder(bitSet);

@@ -157,7 +190,10 @@ public final class DocIdSetBuilder {
         growBuffer((int) newLength);
       } else {
         upgradeToBitSet();
+        counter += numDocs;
       }
+    } else {
+      counter += numDocs;
     }
     return adder;
   }

@@ -179,17 +215,32 @@ public final class DocIdSetBuilder {
     return l;
   }
 
+  private static boolean noDups(int[] a, int len) {
+    for (int i = 1; i < len; ++i) {
+      assert a[i-1] < a[i];
+    }
+    return true;
+  }
+
   /**
    * Build a {@link DocIdSet} from the accumulated doc IDs.
    */
   public DocIdSet build() {
     try {
       if (bitSet != null) {
-        return new BitDocIdSet(bitSet);
+        assert counter >= 0;
+        final long cost = Math.round(counter / numValuesPerDoc);
+        return new BitDocIdSet(bitSet, cost);
       } else {
         LSBRadixSorter sorter = new LSBRadixSorter();
         sorter.sort(PackedInts.bitsRequired(maxDoc - 1), buffer, bufferSize);
-        final int l = dedup(buffer, bufferSize);
+        final int l;
+        if (multivalued) {
+          l = dedup(buffer, bufferSize);
+        } else {
+          assert noDups(buffer, bufferSize);
+          l = bufferSize;
+        }
         assert l <= bufferSize;
         buffer = ArrayUtil.grow(buffer, l + 1);
         buffer[l] = DocIdSetIterator.NO_MORE_DOCS;

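A worked example of the cost estimate in build() above: counter records how many values were reserved through grow(), so dividing by numValuesPerDoc recovers an approximate count of distinct documents. With the 1.5 values-per-doc figure from before and 63 accumulated values:

    class BuildCostDemo {
      public static void main(String[] args) {
        long counter = 63;            // values accumulated via grow()
        double numValuesPerDoc = 1.5; // derived from index stats
        System.out.println(Math.round(counter / numValuesPerDoc)); // 42 estimated docs
      }
    }
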
@@ -19,6 +19,9 @@ package org.apache.lucene.util;
 
 import java.io.IOException;
 
+import org.apache.lucene.index.PointValues;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.DocIdSet;
 import org.apache.lucene.search.DocIdSetIterator;
 

@@ -158,4 +161,180 @@ public class TestDocIdSetBuilder extends LuceneTestCase {
     assertEquals(new BitDocIdSet(expected), builder.build());
   }
 
+  public void testLeverageStats() throws IOException {
+    // single-valued points
+    PointValues values = new DummyPointValues(42, 42);
+    DocIdSetBuilder builder = new DocIdSetBuilder(100, values, "foo");
+    assertEquals(1d, builder.numValuesPerDoc, 0d);
+    assertFalse(builder.multivalued);
+    DocIdSetBuilder.BulkAdder adder = builder.grow(2);
+    adder.add(5);
+    adder.add(7);
+    DocIdSet set = builder.build();
+    assertTrue(set instanceof BitDocIdSet);
+    assertEquals(2, set.iterator().cost());
+
+    // multi-valued points
+    values = new DummyPointValues(42, 63);
+    builder = new DocIdSetBuilder(100, values, "foo");
+    assertEquals(1.5, builder.numValuesPerDoc, 0d);
+    assertTrue(builder.multivalued);
+    adder = builder.grow(2);
+    adder.add(5);
+    adder.add(7);
+    set = builder.build();
+    assertTrue(set instanceof BitDocIdSet);
+    assertEquals(1, set.iterator().cost()); // it thinks the same doc was added twice
+
+    // incomplete stats
+    values = new DummyPointValues(42, -1);
+    builder = new DocIdSetBuilder(100, values, "foo");
+    assertEquals(1d, builder.numValuesPerDoc, 0d);
+    assertTrue(builder.multivalued);
+
+    values = new DummyPointValues(-1, 84);
+    builder = new DocIdSetBuilder(100, values, "foo");
+    assertEquals(1d, builder.numValuesPerDoc, 0d);
+    assertTrue(builder.multivalued);
+
+    // single-valued terms
+    Terms terms = new DummyTerms(42, 42);
+    builder = new DocIdSetBuilder(100, terms);
+    assertEquals(1d, builder.numValuesPerDoc, 0d);
+    assertFalse(builder.multivalued);
+    adder = builder.grow(2);
+    adder.add(5);
+    adder.add(7);
+    set = builder.build();
+    assertTrue(set instanceof BitDocIdSet);
+    assertEquals(2, set.iterator().cost());
+
+    // multi-valued terms
+    terms = new DummyTerms(42, 63);
+    builder = new DocIdSetBuilder(100, terms);
+    assertEquals(1.5, builder.numValuesPerDoc, 0d);
+    assertTrue(builder.multivalued);
+    adder = builder.grow(2);
+    adder.add(5);
+    adder.add(7);
+    set = builder.build();
+    assertTrue(set instanceof BitDocIdSet);
+    assertEquals(1, set.iterator().cost()); // it thinks the same doc was added twice
+
+    // incomplete stats
+    terms = new DummyTerms(42, -1);
+    builder = new DocIdSetBuilder(100, terms);
+    assertEquals(1d, builder.numValuesPerDoc, 0d);
+    assertTrue(builder.multivalued);
+
+    terms = new DummyTerms(-1, 84);
+    builder = new DocIdSetBuilder(100, terms);
+    assertEquals(1d, builder.numValuesPerDoc, 0d);
+    assertTrue(builder.multivalued);
+  }
+
+  private static class DummyTerms extends Terms {
+
+    private final int docCount;
+    private final long numValues;
+
+    DummyTerms(int docCount, long numValues) {
+      this.docCount = docCount;
+      this.numValues = numValues;
+    }
+
+    @Override
+    public TermsEnum iterator() throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public long size() throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public long getSumTotalTermFreq() throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public long getSumDocFreq() throws IOException {
+      return numValues;
+    }
+
+    @Override
+    public int getDocCount() throws IOException {
+      return docCount;
+    }
+
+    @Override
+    public boolean hasFreqs() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean hasOffsets() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean hasPositions() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean hasPayloads() {
+      throw new UnsupportedOperationException();
+    }
+
+  }
+
+  private static class DummyPointValues extends PointValues {
+
+    private final int docCount;
+    private final long numPoints;
+
+    DummyPointValues(int docCount, long numPoints) {
+      this.docCount = docCount;
+      this.numPoints = numPoints;
+    }
+
+    @Override
+    public void intersect(String fieldName, IntersectVisitor visitor) throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public byte[] getMinPackedValue(String fieldName) throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public byte[] getMaxPackedValue(String fieldName) throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public int getNumDimensions(String fieldName) throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public int getBytesPerDimension(String fieldName) throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public long size(String fieldName) {
+      return numPoints;
+    }
+
+    @Override
+    public int getDocCount(String fieldName) {
+      return docCount;
+    }
+
+  }
+
 }

@@ -21,6 +21,7 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Objects;
 import java.util.Set;

@@ -84,6 +85,7 @@ public class TermsQuery extends Query implements Accountable {
   // Same threshold as MultiTermQueryConstantScoreWrapper
   static final int BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD = 16;
 
+  private final Set<String> fields;
   private final PrefixCodedTerms termData;
   private final int termDataHashCode; // cached hashcode of termData
 

@@ -99,13 +101,16 @@ public class TermsQuery extends Query implements Accountable {
       ArrayUtil.timSort(sortedTerms);
     }
     PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
+    Set<String> fields = new HashSet<>();
     Term previous = null;
     for (Term term : sortedTerms) {
       if (term.equals(previous) == false) {
+        fields.add(term.field());
         builder.add(term);
       }
       previous = term;
     }
+    this.fields = Collections.unmodifiableSet(fields);
     termData = builder.finish();
     termDataHashCode = termData.hashCode();
   }

@@ -132,6 +137,7 @@ public class TermsQuery extends Query implements Accountable {
       builder.add(field, term);
       previous.copyBytes(term);
     }
+    fields = Collections.singleton(field);
     termData = builder.finish();
     termDataHashCode = termData.hashCode();
   }

@@ -301,7 +307,15 @@ public class TermsQuery extends Query implements Accountable {
           matchingTerms.add(new TermAndState(field, termsEnum));
         } else {
           assert matchingTerms.size() == threshold;
-          builder = new DocIdSetBuilder(reader.maxDoc());
+          if (TermsQuery.this.fields.size() == 1) {
+            // common case: all terms are in the same field
+            // use an optimized builder that leverages terms stats to be more efficient
+            builder = new DocIdSetBuilder(reader.maxDoc(), terms);
+          } else {
+            // corner case: different fields
+            // don't make assumptions about the docs we will get
+            builder = new DocIdSetBuilder(reader.maxDoc());
+          }
           docs = termsEnum.postings(docs, PostingsEnum.NONE);
           builder.add(docs);
           for (TermAndState t : matchingTerms) {

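A hedged sketch (not from the patch) of the decision the hunk above encodes: one field's Terms stats say nothing about postings collected from other fields, so the stats-free constructor remains the safe fallback. Names are illustrative:

    import java.io.IOException;
    import java.util.Set;

    import org.apache.lucene.index.LeafReader;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.util.DocIdSetBuilder;

    class FieldAwareBuilderDemo {
      static DocIdSetBuilder create(LeafReader reader, Set<String> queryFields) throws IOException {
        if (queryFields.size() == 1) {
          Terms terms = reader.terms(queryFields.iterator().next());
          if (terms != null) {
            return new DocIdSetBuilder(reader.maxDoc(), terms); // stats apply to every posting
          }
        }
        return new DocIdSetBuilder(reader.maxDoc()); // mixed fields: make no assumptions
      }
    }
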
@@ -163,8 +163,8 @@ public class IntersectsRPTVerifyQuery extends Query {
     // TODO consider if IntersectsPrefixTreeQuery should simply do this and provide both sets
 
     class IntersectsDifferentiatingVisitor extends VisitorTemplate {
-      DocIdSetBuilder approxBuilder = new DocIdSetBuilder(maxDoc);
-      DocIdSetBuilder exactBuilder = new DocIdSetBuilder(maxDoc);
+      DocIdSetBuilder approxBuilder = new DocIdSetBuilder(maxDoc, terms);
+      DocIdSetBuilder exactBuilder = new DocIdSetBuilder(maxDoc, terms);
       boolean approxIsEmpty = true;
       boolean exactIsEmpty = true;
       DocIdSet exactDocIdSet;

@@ -105,16 +105,20 @@ public abstract class AbstractPrefixTreeQuery extends Query {
     protected final LeafReaderContext context;
     protected final int maxDoc;
 
-    protected TermsEnum termsEnum;//remember to check for null!
+    protected final Terms terms;
+    protected final TermsEnum termsEnum;//remember to check for null!
     protected PostingsEnum postingsEnum;
 
     public BaseTermsEnumTraverser(LeafReaderContext context) throws IOException {
       this.context = context;
       LeafReader reader = context.reader();
       this.maxDoc = reader.maxDoc();
-      Terms terms = reader.terms(fieldName);
-      if (terms != null)
+      terms = reader.terms(fieldName);
+      if (terms != null) {
         this.termsEnum = terms.iterator();
+      } else {
+        this.termsEnum = null;
+      }
     }
 
     protected void collectDocs(BitSet bitSet) throws IOException {

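A side note on the new else branch above: once termsEnum becomes final, Java's definite-assignment rule requires every constructor path to assign it, so the null case must be written out. A minimal illustration (not Lucene code):

    class DefiniteAssignmentDemo {
      private final Object ref;

      DefiniteAssignmentDemo(boolean present) {
        if (present) {
          this.ref = new Object();
        } else {
          this.ref = null; // without this branch the class would not compile
        }
      }
    }
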
@@ -55,8 +55,8 @@ public class IntersectsPrefixTreeQuery extends AbstractVisitingPrefixTreeQuery {
       private DocIdSetBuilder results;
 
       @Override
-      protected void start() {
-        results = new DocIdSetBuilder(maxDoc);
+      protected void start() throws IOException {
+        results = new DocIdSetBuilder(maxDoc, terms);
       }
 
       @Override

@@ -93,7 +93,7 @@ final class GeoPointTermQueryConstantScoreWrapper <Q extends GeoPointMultiTermQu
 
         LeafReader reader = context.reader();
         // approximation (postfiltering has not yet been applied)
-        DocIdSetBuilder builder = new DocIdSetBuilder(reader.maxDoc());
+        DocIdSetBuilder builder = new DocIdSetBuilder(reader.maxDoc(), terms);
         // subset of documents that need no postfiltering, this is purely an optimization
         final BitSet preApproved;
         // dumb heuristic: if the field is really sparse, use a sparse impl

@@ -97,7 +97,7 @@ final class PointInGeo3DShapeQuery extends Query {
     assert xyzSolid.getRelationship(shape) == GeoArea.WITHIN || xyzSolid.getRelationship(shape) == GeoArea.OVERLAPS: "expected WITHIN (1) or OVERLAPS (2) but got " + xyzSolid.getRelationship(shape) + "; shape="+shape+"; XYZSolid="+xyzSolid;
     */
 
-    DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc());
+    DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field);
 
     values.intersect(field, new PointInShapeIntersectVisitor(result, shape, shapeBounds));
 

@@ -433,7 +433,7 @@ public final class SolrRangeQuery extends ExtendedQueryBase implements DocSetPro
       termsVisited += builder.add(termsEnum, base, liveDocs);
     */
 
-    DocIdSetBuilder builder = new DocIdSetBuilder(context.reader().maxDoc());
+    DocIdSetBuilder builder = new DocIdSetBuilder(context.reader().maxDoc(), terms);
     builder.grow((int)Math.min(Integer.MAX_VALUE,count));
     if (collectedTerms.isEmpty() == false) {
       TermsEnum termsEnum2 = terms.iterator();

@@ -226,7 +226,7 @@ public class GraphTermsQParserPlugin extends QParserPlugin {
          Terms terms = fields.terms(field);
          TermsEnum termsEnum = terms.iterator();
          PostingsEnum docs = null;
-         DocIdSetBuilder builder = new DocIdSetBuilder(reader.maxDoc());
+         DocIdSetBuilder builder = new DocIdSetBuilder(reader.maxDoc(), terms);
          for (int i=0; i<finalContexts.size(); i++) {
            TermContext termContext = finalContexts.get(i);
            TermState termState = termContext.get(context.ord);