From 4fa2b29b200b2a92157396af3f485d38a4954e7a Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Mon, 2 May 2016 15:04:27 +0200 Subject: [PATCH] LUCENE-7262: Leverage index statistics to make DocIdSetBuilder more efficient. --- lucene/CHANGES.txt | 4 +- .../MultiTermQueryConstantScoreWrapper.java | 2 +- .../apache/lucene/search/PointInSetQuery.java | 2 +- .../apache/lucene/search/PointRangeQuery.java | 2 +- .../org/apache/lucene/util/BitDocIdSet.java | 3 + .../apache/lucene/util/BitSetIterator.java | 3 + .../apache/lucene/util/DocIdSetBuilder.java | 55 +++++- .../lucene/util/TestDocIdSetBuilder.java | 179 ++++++++++++++++++ .../org/apache/lucene/queries/TermsQuery.java | 16 +- .../composite/IntersectsRPTVerifyQuery.java | 4 +- .../prefix/AbstractPrefixTreeQuery.java | 10 +- .../prefix/IntersectsPrefixTreeQuery.java | 4 +- ...GeoPointTermQueryConstantScoreWrapper.java | 2 +- .../spatial3d/PointInGeo3DShapeQuery.java | 2 +- .../org/apache/solr/query/SolrRangeQuery.java | 2 +- .../solr/search/GraphTermsQParserPlugin.java | 2 +- 16 files changed, 273 insertions(+), 19 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index a23a54779ca..41daaab407c 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -78,8 +78,8 @@ Optimizations * LUCENE-7237: LRUQueryCache now prefers returning an uncached Scorer than waiting on a lock. (Adrien Grand) -* LUCENE-7261, LUCENE-7264: Speed up DocIdSetBuilder (which is used by - TermsQuery, multi-term queries and point queries). (Adrien Grand) +* LUCENE-7261, LUCENE-7262, LUCENE-7264: Speed up DocIdSetBuilder (which is used + by TermsQuery, multi-term queries and point queries). (Adrien Grand) Bug Fixes diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiTermQueryConstantScoreWrapper.java b/lucene/core/src/java/org/apache/lucene/search/MultiTermQueryConstantScoreWrapper.java index b7a27dcd781..eb7436f1247 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MultiTermQueryConstantScoreWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/search/MultiTermQueryConstantScoreWrapper.java @@ -166,7 +166,7 @@ final class MultiTermQueryConstantScoreWrapper extends } // Too many terms: go back to the terms we already collected and start building the bit set - DocIdSetBuilder builder = new DocIdSetBuilder(context.reader().maxDoc()); + DocIdSetBuilder builder = new DocIdSetBuilder(context.reader().maxDoc(), terms); if (collectedTerms.isEmpty() == false) { TermsEnum termsEnum2 = terms.iterator(); for (TermAndState t : collectedTerms) { diff --git a/lucene/core/src/java/org/apache/lucene/search/PointInSetQuery.java b/lucene/core/src/java/org/apache/lucene/search/PointInSetQuery.java index 9be794cccc7..05490778797 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PointInSetQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PointInSetQuery.java @@ -130,7 +130,7 @@ public abstract class PointInSetQuery extends Query { throw new IllegalArgumentException("field=\"" + field + "\" was indexed with bytesPerDim=" + fieldInfo.getPointNumBytes() + " but this query has bytesPerDim=" + bytesPerDim); } - DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc()); + DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field); if (numDims == 1) { diff --git a/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java b/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java index cae2192c21b..abd4cbc20da 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PointRangeQuery.java @@ -106,7 +106,7 @@ public abstract class PointRangeQuery extends Query { return new ConstantScoreWeight(this) { private DocIdSet buildMatchingDocIdSet(LeafReader reader, PointValues values) throws IOException { - DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc()); + DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc(), values, field); values.intersect(field, new IntersectVisitor() { diff --git a/lucene/core/src/java/org/apache/lucene/util/BitDocIdSet.java b/lucene/core/src/java/org/apache/lucene/util/BitDocIdSet.java index 90d4f7b1a8a..c7387a584af 100644 --- a/lucene/core/src/java/org/apache/lucene/util/BitDocIdSet.java +++ b/lucene/core/src/java/org/apache/lucene/util/BitDocIdSet.java @@ -36,6 +36,9 @@ public class BitDocIdSet extends DocIdSet { * {@link BitSet} must not be modified afterwards. */ public BitDocIdSet(BitSet set, long cost) { + if (cost < 0) { + throw new IllegalArgumentException("cost must be >= 0, got " + cost); + } this.set = set; this.cost = cost; } diff --git a/lucene/core/src/java/org/apache/lucene/util/BitSetIterator.java b/lucene/core/src/java/org/apache/lucene/util/BitSetIterator.java index 9e181c6a163..1a51d801aac 100644 --- a/lucene/core/src/java/org/apache/lucene/util/BitSetIterator.java +++ b/lucene/core/src/java/org/apache/lucene/util/BitSetIterator.java @@ -54,6 +54,9 @@ public class BitSetIterator extends DocIdSetIterator { /** Sole constructor. */ public BitSetIterator(BitSet bits, long cost) { + if (cost < 0) { + throw new IllegalArgumentException("cost must be >= 0, got " + cost); + } this.bits = bits; this.length = bits.length(); this.cost = cost; diff --git a/lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java b/lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java index b85a8f29fbe..a00fdca7e1d 100644 --- a/lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/util/DocIdSetBuilder.java @@ -19,6 +19,8 @@ package org.apache.lucene.util; import java.io.IOException; import java.util.Arrays; +import org.apache.lucene.index.PointValues; +import org.apache.lucene.index.Terms; import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.packed.PackedInts; @@ -65,18 +67,48 @@ public final class DocIdSetBuilder { private final int maxDoc; private final int threshold; + // pkg-private for testing + final boolean multivalued; + final double numValuesPerDoc; private int[] buffer; private int bufferSize; private FixedBitSet bitSet; + + private long counter = -1; private BulkAdder adder = new BufferAdder(); /** * Create a builder that can contain doc IDs between {@code 0} and {@code maxDoc}. */ public DocIdSetBuilder(int maxDoc) { + this(maxDoc, -1, -1); + } + + /** Create a {@link DocIdSetBuilder} instance that is optimized for + * accumulating docs that match the given {@link Terms}. */ + public DocIdSetBuilder(int maxDoc, Terms terms) throws IOException { + this(maxDoc, terms.getDocCount(), terms.getSumDocFreq()); + } + + /** Create a {@link DocIdSetBuilder} instance that is optimized for + * accumulating docs that match the given {@link PointValues}. */ + public DocIdSetBuilder(int maxDoc, PointValues values, String field) throws IOException { + this(maxDoc, values.getDocCount(field), values.size(field)); + } + + DocIdSetBuilder(int maxDoc, int docCount, long valueCount) { this.maxDoc = maxDoc; + this.multivalued = docCount < 0 || docCount != valueCount; + this.numValuesPerDoc = (docCount < 0 || valueCount < 0) + // assume one value per doc, this means the cost will be overestimated + // if the docs are actually multi-valued + ? 1 + // otherwise compute from index stats + : (double) valueCount / docCount; + assert numValuesPerDoc >= 1; + // For ridiculously small sets, we'll just use a sorted int[] // maxDoc >>> 7 is a good value if you want to save memory, lower values // such as maxDoc >>> 11 should provide faster building but at the expense @@ -94,6 +126,7 @@ public final class DocIdSetBuilder { for (int i = 0; i < bufferSize; ++i) { bitSet.set(buffer[i]); } + counter = this.bufferSize; this.buffer = null; this.bufferSize = 0; this.adder = new FixedBitSetAdder(bitSet); @@ -157,7 +190,10 @@ public final class DocIdSetBuilder { growBuffer((int) newLength); } else { upgradeToBitSet(); + counter += numDocs; } + } else { + counter += numDocs; } return adder; } @@ -179,17 +215,32 @@ public final class DocIdSetBuilder { return l; } + private static boolean noDups(int[] a, int len) { + for (int i = 1; i < len; ++i) { + assert a[i-1] < a[i]; + } + return true; + } + /** * Build a {@link DocIdSet} from the accumulated doc IDs. */ public DocIdSet build() { try { if (bitSet != null) { - return new BitDocIdSet(bitSet); + assert counter >= 0; + final long cost = Math.round(counter / numValuesPerDoc); + return new BitDocIdSet(bitSet, cost); } else { LSBRadixSorter sorter = new LSBRadixSorter(); sorter.sort(PackedInts.bitsRequired(maxDoc - 1), buffer, bufferSize); - final int l = dedup(buffer, bufferSize); + final int l; + if (multivalued) { + l = dedup(buffer, bufferSize); + } else { + assert noDups(buffer, bufferSize); + l = bufferSize; + } assert l <= bufferSize; buffer = ArrayUtil.grow(buffer, l + 1); buffer[l] = DocIdSetIterator.NO_MORE_DOCS; diff --git a/lucene/core/src/test/org/apache/lucene/util/TestDocIdSetBuilder.java b/lucene/core/src/test/org/apache/lucene/util/TestDocIdSetBuilder.java index 8814be1f359..5dd8eb30697 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestDocIdSetBuilder.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestDocIdSetBuilder.java @@ -19,6 +19,9 @@ package org.apache.lucene.util; import java.io.IOException; +import org.apache.lucene.index.PointValues; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; @@ -158,4 +161,180 @@ public class TestDocIdSetBuilder extends LuceneTestCase { assertEquals(new BitDocIdSet(expected), builder.build()); } + public void testLeverageStats() throws IOException { + // single-valued points + PointValues values = new DummyPointValues(42, 42); + DocIdSetBuilder builder = new DocIdSetBuilder(100, values, "foo"); + assertEquals(1d, builder.numValuesPerDoc, 0d); + assertFalse(builder.multivalued); + DocIdSetBuilder.BulkAdder adder = builder.grow(2); + adder.add(5); + adder.add(7); + DocIdSet set = builder.build(); + assertTrue(set instanceof BitDocIdSet); + assertEquals(2, set.iterator().cost()); + + // multi-valued points + values = new DummyPointValues(42, 63); + builder = new DocIdSetBuilder(100, values, "foo"); + assertEquals(1.5, builder.numValuesPerDoc, 0d); + assertTrue(builder.multivalued); + adder = builder.grow(2); + adder.add(5); + adder.add(7); + set = builder.build(); + assertTrue(set instanceof BitDocIdSet); + assertEquals(1, set.iterator().cost()); // it thinks the same doc was added twice + + // incomplete stats + values = new DummyPointValues(42, -1); + builder = new DocIdSetBuilder(100, values, "foo"); + assertEquals(1d, builder.numValuesPerDoc, 0d); + assertTrue(builder.multivalued); + + values = new DummyPointValues(-1, 84); + builder = new DocIdSetBuilder(100, values, "foo"); + assertEquals(1d, builder.numValuesPerDoc, 0d); + assertTrue(builder.multivalued); + + // single-valued terms + Terms terms = new DummyTerms(42, 42); + builder = new DocIdSetBuilder(100, terms); + assertEquals(1d, builder.numValuesPerDoc, 0d); + assertFalse(builder.multivalued); + adder = builder.grow(2); + adder.add(5); + adder.add(7); + set = builder.build(); + assertTrue(set instanceof BitDocIdSet); + assertEquals(2, set.iterator().cost()); + + // multi-valued terms + terms = new DummyTerms(42, 63); + builder = new DocIdSetBuilder(100, terms); + assertEquals(1.5, builder.numValuesPerDoc, 0d); + assertTrue(builder.multivalued); + adder = builder.grow(2); + adder.add(5); + adder.add(7); + set = builder.build(); + assertTrue(set instanceof BitDocIdSet); + assertEquals(1, set.iterator().cost()); // it thinks the same doc was added twice + + // incomplete stats + terms = new DummyTerms(42, -1); + builder = new DocIdSetBuilder(100, terms); + assertEquals(1d, builder.numValuesPerDoc, 0d); + assertTrue(builder.multivalued); + + terms = new DummyTerms(-1, 84); + builder = new DocIdSetBuilder(100, terms); + assertEquals(1d, builder.numValuesPerDoc, 0d); + assertTrue(builder.multivalued); + } + + private static class DummyTerms extends Terms { + + private final int docCount; + private final long numValues; + + DummyTerms(int docCount, long numValues) { + this.docCount = docCount; + this.numValues = numValues; + } + + @Override + public TermsEnum iterator() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long size() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long getSumTotalTermFreq() throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long getSumDocFreq() throws IOException { + return numValues; + } + + @Override + public int getDocCount() throws IOException { + return docCount; + } + + @Override + public boolean hasFreqs() { + throw new UnsupportedOperationException(); + } + + @Override + public boolean hasOffsets() { + throw new UnsupportedOperationException(); + } + + @Override + public boolean hasPositions() { + throw new UnsupportedOperationException(); + } + + @Override + public boolean hasPayloads() { + throw new UnsupportedOperationException(); + } + + } + + private static class DummyPointValues extends PointValues { + + private final int docCount; + private final long numPoints; + + DummyPointValues(int docCount, long numPoints) { + this.docCount = docCount; + this.numPoints = numPoints; + } + + @Override + public void intersect(String fieldName, IntersectVisitor visitor) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public byte[] getMinPackedValue(String fieldName) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public byte[] getMaxPackedValue(String fieldName) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int getNumDimensions(String fieldName) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public int getBytesPerDimension(String fieldName) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long size(String fieldName) { + return numPoints; + } + + @Override + public int getDocCount(String fieldName) { + return docCount; + } + + } + } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/TermsQuery.java b/lucene/queries/src/java/org/apache/lucene/queries/TermsQuery.java index 4c20571e1ba..eeee61f4699 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/TermsQuery.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/TermsQuery.java @@ -21,6 +21,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; +import java.util.HashSet; import java.util.List; import java.util.Objects; import java.util.Set; @@ -84,6 +85,7 @@ public class TermsQuery extends Query implements Accountable { // Same threshold as MultiTermQueryConstantScoreWrapper static final int BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD = 16; + private final Set fields; private final PrefixCodedTerms termData; private final int termDataHashCode; // cached hashcode of termData @@ -99,13 +101,16 @@ public class TermsQuery extends Query implements Accountable { ArrayUtil.timSort(sortedTerms); } PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder(); + Set fields = new HashSet<>(); Term previous = null; for (Term term : sortedTerms) { if (term.equals(previous) == false) { + fields.add(term.field()); builder.add(term); } previous = term; } + this.fields = Collections.unmodifiableSet(fields); termData = builder.finish(); termDataHashCode = termData.hashCode(); } @@ -132,6 +137,7 @@ public class TermsQuery extends Query implements Accountable { builder.add(field, term); previous.copyBytes(term); } + fields = Collections.singleton(field); termData = builder.finish(); termDataHashCode = termData.hashCode(); } @@ -301,7 +307,15 @@ public class TermsQuery extends Query implements Accountable { matchingTerms.add(new TermAndState(field, termsEnum)); } else { assert matchingTerms.size() == threshold; - builder = new DocIdSetBuilder(reader.maxDoc()); + if (TermsQuery.this.fields.size() == 1) { + // common case: all terms are in the same field + // use an optimized builder that leverages terms stats to be more efficient + builder = new DocIdSetBuilder(reader.maxDoc(), terms); + } else { + // corner case: different fields + // don't make assumptions about the docs we will get + builder = new DocIdSetBuilder(reader.maxDoc()); + } docs = termsEnum.postings(docs, PostingsEnum.NONE); builder.add(docs); for (TermAndState t : matchingTerms) { diff --git a/lucene/spatial-extras/src/java/org/apache/lucene/spatial/composite/IntersectsRPTVerifyQuery.java b/lucene/spatial-extras/src/java/org/apache/lucene/spatial/composite/IntersectsRPTVerifyQuery.java index f60bfeeaa13..886491e5454 100644 --- a/lucene/spatial-extras/src/java/org/apache/lucene/spatial/composite/IntersectsRPTVerifyQuery.java +++ b/lucene/spatial-extras/src/java/org/apache/lucene/spatial/composite/IntersectsRPTVerifyQuery.java @@ -163,8 +163,8 @@ public class IntersectsRPTVerifyQuery extends Query { // TODO consider if IntersectsPrefixTreeQuery should simply do this and provide both sets class IntersectsDifferentiatingVisitor extends VisitorTemplate { - DocIdSetBuilder approxBuilder = new DocIdSetBuilder(maxDoc); - DocIdSetBuilder exactBuilder = new DocIdSetBuilder(maxDoc); + DocIdSetBuilder approxBuilder = new DocIdSetBuilder(maxDoc, terms); + DocIdSetBuilder exactBuilder = new DocIdSetBuilder(maxDoc, terms); boolean approxIsEmpty = true; boolean exactIsEmpty = true; DocIdSet exactDocIdSet; diff --git a/lucene/spatial-extras/src/java/org/apache/lucene/spatial/prefix/AbstractPrefixTreeQuery.java b/lucene/spatial-extras/src/java/org/apache/lucene/spatial/prefix/AbstractPrefixTreeQuery.java index bcf486777ed..11cc22c36a5 100644 --- a/lucene/spatial-extras/src/java/org/apache/lucene/spatial/prefix/AbstractPrefixTreeQuery.java +++ b/lucene/spatial-extras/src/java/org/apache/lucene/spatial/prefix/AbstractPrefixTreeQuery.java @@ -105,16 +105,20 @@ public abstract class AbstractPrefixTreeQuery extends Query { protected final LeafReaderContext context; protected final int maxDoc; - protected TermsEnum termsEnum;//remember to check for null! + protected final Terms terms; + protected final TermsEnum termsEnum;//remember to check for null! protected PostingsEnum postingsEnum; public BaseTermsEnumTraverser(LeafReaderContext context) throws IOException { this.context = context; LeafReader reader = context.reader(); this.maxDoc = reader.maxDoc(); - Terms terms = reader.terms(fieldName); - if (terms != null) + terms = reader.terms(fieldName); + if (terms != null) { this.termsEnum = terms.iterator(); + } else { + this.termsEnum = null; + } } protected void collectDocs(BitSet bitSet) throws IOException { diff --git a/lucene/spatial-extras/src/java/org/apache/lucene/spatial/prefix/IntersectsPrefixTreeQuery.java b/lucene/spatial-extras/src/java/org/apache/lucene/spatial/prefix/IntersectsPrefixTreeQuery.java index 17c5a7ea1ec..c1d76dcb163 100644 --- a/lucene/spatial-extras/src/java/org/apache/lucene/spatial/prefix/IntersectsPrefixTreeQuery.java +++ b/lucene/spatial-extras/src/java/org/apache/lucene/spatial/prefix/IntersectsPrefixTreeQuery.java @@ -55,8 +55,8 @@ public class IntersectsPrefixTreeQuery extends AbstractVisitingPrefixTreeQuery { private DocIdSetBuilder results; @Override - protected void start() { - results = new DocIdSetBuilder(maxDoc); + protected void start() throws IOException { + results = new DocIdSetBuilder(maxDoc, terms); } @Override diff --git a/lucene/spatial/src/java/org/apache/lucene/spatial/geopoint/search/GeoPointTermQueryConstantScoreWrapper.java b/lucene/spatial/src/java/org/apache/lucene/spatial/geopoint/search/GeoPointTermQueryConstantScoreWrapper.java index fb467e6317d..13ded15a9fd 100644 --- a/lucene/spatial/src/java/org/apache/lucene/spatial/geopoint/search/GeoPointTermQueryConstantScoreWrapper.java +++ b/lucene/spatial/src/java/org/apache/lucene/spatial/geopoint/search/GeoPointTermQueryConstantScoreWrapper.java @@ -93,7 +93,7 @@ final class GeoPointTermQueryConstantScoreWrapper