Search: Avoid calling DocIdSets.toSafeBits.

This method is heavy as it builds a bitset out of a DocIdSet in order to
provide random access. Now that Lucene has removed out-of-order scoring,
true random access is very rarely needed, and we can instead return a Bits
instance that wraps the iterator. Ideally we would use the DISI API directly,
but I have to admit that the Bits API is more friendly.

Close #9546
Adrien Grand 2015-02-03 14:55:41 +01:00
parent e5b174ff77
commit 8540a863aa
8 changed files with 113 additions and 11 deletions
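
To make the tradeoff concrete, here is a minimal sketch of the two strategies, not code from this commit: materialize() mirrors what toSafeBits does today (consume the whole iterator into a FixedBitSet up front), while wrap() mirrors the new lazy approach. It assumes the Lucene 5.x DocIdSetIterator, FixedBitSet and Bits APIs that appear in the diff below; the class and method names are illustrative.

import java.io.IOException;

import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;

class RandomAccessVsSequentialSketch {

    // toSafeBits-style: a full pass over the iterator and O(maxDoc) bits of
    // memory, even if the caller only ever asks about a few documents.
    static Bits materialize(DocIdSetIterator it, int maxDoc) throws IOException {
        FixedBitSet bitSet = new FixedBitSet(maxDoc);
        for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
            bitSet.set(doc);
        }
        return bitSet;
    }

    // asSequentialAccessBits-style: no up-front work; get(i) merely advances
    // the iterator, so callers must ask for doc IDs in non-decreasing order.
    static Bits wrap(final DocIdSetIterator it, final int maxDoc) {
        return new Bits() {
            @Override
            public boolean get(int index) {
                try {
                    int doc = it.docID();
                    if (doc < index) {
                        doc = it.advance(index);
                    }
                    return doc == index;
                } catch (IOException e) {
                    throw new IllegalStateException("Cannot advance iterator", e);
                }
            }

            @Override
            public int length() {
                return maxDoc;
            }
        };
    }
}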

DocIdSets.java

@@ -30,6 +30,8 @@ import org.apache.lucene.util.Bits;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.RoaringDocIdSet;
import org.apache.lucene.util.SparseFixedBitSet;
+import org.elasticsearch.ElasticsearchIllegalArgumentException;
+import org.elasticsearch.ElasticsearchIllegalStateException;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.lucene.search.XDocIdSetIterator;
@@ -109,11 +111,16 @@ public class DocIdSets {
}
/**
- * Gets a set to bits.
+ * Build a {@link Bits} instance that will match all documents
+ * contained in {@code set}. Note that this is a potentially heavy
+ * operation as it might require consuming an iterator of this set
+ * entirely and loading it into a {@link BitSet}. Prefer using
+ * {@link #asSequentialAccessBits} if you only need to consume the
+ * {@link Bits} once and in order.
*/
-public static Bits toSafeBits(LeafReader reader, @Nullable DocIdSet set) throws IOException {
+public static Bits toSafeBits(int maxDoc, @Nullable DocIdSet set) throws IOException {
if (set == null) {
-return new Bits.MatchNoBits(reader.maxDoc());
+return new Bits.MatchNoBits(maxDoc);
}
Bits bits = set.bits();
if (bits != null) {
@@ -121,9 +128,56 @@
}
DocIdSetIterator iterator = set.iterator();
if (iterator == null) {
-return new Bits.MatchNoBits(reader.maxDoc());
+return new Bits.MatchNoBits(maxDoc);
}
-return toBitSet(iterator, reader.maxDoc());
+return toBitSet(iterator, maxDoc);
}
/**
* Given a {@link DocIdSet}, return a {@link Bits} instance that will match
* all documents contained in the set. Note that the returned {@link Bits}
* instance should only be consumed once and in order.
*/
public static Bits asSequentialAccessBits(final int maxDoc, @Nullable DocIdSet set) throws IOException {
if (set == null) {
return new Bits.MatchNoBits(maxDoc);
}
Bits bits = set.bits();
if (bits != null) {
return bits;
}
final DocIdSetIterator iterator = set.iterator();
if (iterator == null) {
return new Bits.MatchNoBits(maxDoc);
}
return new Bits() {
int previous = 0;
@Override
public boolean get(int index) {
if (index < previous) {
throw new ElasticsearchIllegalArgumentException("This Bits instance can only be consumed in order. "
+ "Got called on [" + index + "] while previously called on [" + previous + "]");
}
previous = index;
int doc = iterator.docID();
if (doc < index) {
try {
doc = iterator.advance(index);
} catch (IOException e) {
throw new ElasticsearchIllegalStateException("Cannot advance iterator", e);
}
}
return index == doc;
}
@Override
public int length() {
return maxDoc;
}
};
}
/**

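Before the call sites, a hedged usage sketch of the new helper that shows the in-order contract; the LeafReaderContext/Filter setup and the import path for DocIdSets are assumptions, not part of this diff.

import java.io.IOException;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.Bits;
import org.elasticsearch.common.lucene.docset.DocIdSets;

class SequentialAccessBitsUsage {
    // get() may only be called with non-decreasing doc IDs: the returned Bits
    // advances the set's iterator instead of materializing a bitset.
    static void demo(LeafReaderContext context, Filter filter) throws IOException {
        Bits bits = DocIdSets.asSequentialAccessBits(
                context.reader().maxDoc(), filter.getDocIdSet(context, null));
        bits.get(3);   // fine
        bits.get(10);  // fine, forward access only advances the iterator
        bits.get(7);   // throws ElasticsearchIllegalArgumentException
    }
}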
FilterableTermsEnum.java

@@ -96,7 +96,7 @@ public class FilterableTermsEnum extends TermsEnum {
// fully filtered, none matching, no need to iterate on this
continue;
}
-bits = DocIdSets.toSafeBits(context.reader(), docIdSet);
+bits = DocIdSets.toSafeBits(context.reader().maxDoc(), docIdSet);
// Count how many docs are in our filtered set
// TODO make this lazy-loaded only for those that need it?
DocIdSetIterator iterator = docIdSet.iterator();

FiltersFunctionScoreQuery.java

@@ -163,7 +163,7 @@ public class FiltersFunctionScoreQuery extends Query {
for (int i = 0; i < filterFunctions.length; i++) {
FilterFunction filterFunction = filterFunctions[i];
filterFunction.function.setNextReader(context);
-docSets[i] = DocIdSets.toSafeBits(context.reader(), filterFunction.filter.getDocIdSet(context, acceptDocs));
+docSets[i] = DocIdSets.asSequentialAccessBits(context.reader().maxDoc(), filterFunction.filter.getDocIdSet(context, acceptDocs));
}
return new FiltersFunctionFactorScorer(this, subQueryScorer, scoreMode, filterFunctions, maxBoost, docSets, combineFunction, minScore);
}
@@ -186,7 +186,7 @@
weightSum++;
}
-Bits docSet = DocIdSets.toSafeBits(context.reader(),
+Bits docSet = DocIdSets.asSequentialAccessBits(context.reader().maxDoc(),
filterFunction.filter.getDocIdSet(context, context.reader().getLiveDocs()));
if (docSet.get(doc)) {
filterFunction.function.setNextReader(context);

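The FiltersFunctionScoreQuery change is safe because a scorer visits documents in increasing doc ID order within a segment, so every docSets[i].get(doc) call is monotonic. A hedged sketch of such a loop follows (not the actual FiltersFunctionFactorScorer code; it assumes the Lucene 5.x Scorer API, where a Scorer is itself a DocIdSetIterator):

import java.io.IOException;

import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.util.Bits;

class InOrderScoringSketch {
    // Drives a sub-scorer and probes each filter's Bits; doc only ever grows,
    // so the sequential-access contract is never violated.
    static void scoreAll(Scorer subQueryScorer, Bits[] docSets) throws IOException {
        for (int doc = subQueryScorer.nextDoc();
                doc != DocIdSetIterator.NO_MORE_DOCS;
                doc = subQueryScorer.nextDoc()) {
            for (Bits docSet : docSets) {
                if (docSet.get(doc)) {
                    // the matching filter's function would contribute to the score here
                }
            }
        }
    }
}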
ParentToChildrenAggregator.java

@@ -129,7 +129,7 @@ public class ParentToChildrenAggregator extends SingleBucketAggregator implement
-// The DocIdSets.toSafeBits(...) can convert to FixedBitSet, but this
-// will only happen if the none filter cache is used. (which only happens in tests)
-// Otherwise the filter cache will produce a bitset based filter.
-parentDocs = DocIdSets.toSafeBits(reader.reader(), parentDocIdSet);
+parentDocs = DocIdSets.asSequentialAccessBits(reader.reader().maxDoc(), parentDocIdSet);
DocIdSet childDocIdSet = childFilter.getDocIdSet(reader, null);
if (globalOrdinals != null && !DocIdSets.isEmpty(childDocIdSet)) {
replay.add(reader);

FilterAggregator.java

@@ -51,7 +51,7 @@ public class FilterAggregator extends SingleBucketAggregator {
@Override
public void setNextReader(LeafReaderContext reader) {
try {
-bits = DocIdSets.toSafeBits(reader.reader(), filter.getDocIdSet(reader, null));
+bits = DocIdSets.asSequentialAccessBits(reader.reader().maxDoc(), filter.getDocIdSet(reader, null));
} catch (IOException ioe) {
throw new AggregationExecutionException("Failed to aggregate filter aggregator [" + name + "]", ioe);
}

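The aggregator call sites rely on the same invariant: collect() is invoked with ascending doc IDs within a segment, and the same reasoning applies to FiltersAggregator below. A hedged sketch of the pattern, not the actual FilterAggregator code:

import org.apache.lucene.util.Bits;

class FilterCollectSketch {
    private Bits bits; // refreshed per segment via DocIdSets.asSequentialAccessBits(...)
    private long docCount;

    // collect() is called with ascending doc IDs within a segment, so a
    // sequential-access Bits is sufficient here.
    void collect(int doc) {
        if (bits.get(doc)) {
            docCount++;
        }
    }
}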
FiltersAggregator.java

@@ -69,7 +69,7 @@ public class FiltersAggregator extends BucketsAggregator {
public void setNextReader(LeafReaderContext reader) {
try {
for (int i = 0; i < filters.length; i++) {
-bits[i] = DocIdSets.toSafeBits(reader.reader(), filters[i].filter.getDocIdSet(reader, null));
+bits[i] = DocIdSets.asSequentialAccessBits(reader.reader().maxDoc(), filters[i].filter.getDocIdSet(reader, null));
}
} catch (IOException ioe) {
throw new AggregationExecutionException("Failed to aggregate filter aggregator [" + name + "]", ioe);

DocIdSetsTests.java

@@ -23,7 +23,11 @@ import java.io.IOException;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.RoaringDocIdSet;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
@@ -119,4 +123,46 @@ public class DocIdSetsTests extends ElasticsearchSingleNodeTest {
test(indexService, true, filter);
}
public void testAsSequentialAccessBits() throws IOException {
final int maxDoc = randomIntBetween(5, 100);
// Null DocIdSet maps to empty bits
Bits bits = DocIdSets.asSequentialAccessBits(100, null);
for (int i = 0; i < maxDoc; ++i) {
assertFalse(bits.get(i));
}
// Empty set maps to empty bits
bits = DocIdSets.asSequentialAccessBits(100, DocIdSet.EMPTY);
for (int i = 0; i < maxDoc; ++i) {
assertFalse(bits.get(i));
}
RoaringDocIdSet.Builder b = new RoaringDocIdSet.Builder(maxDoc);
for (int i = randomInt(maxDoc - 1); i < maxDoc; i += randomIntBetween(1, 10)) {
b.add(i);
}
final RoaringDocIdSet set = b.build();
// RoaringDocIdSet does not support random access
assertNull(set.bits());
bits = DocIdSets.asSequentialAccessBits(100, set);
bits.get(4);
try {
bits.get(2);
fail("Should have thrown an exception because of out-of-order consumption");
} catch (ElasticsearchIllegalArgumentException e) {
// ok
}
bits = DocIdSets.asSequentialAccessBits(100, set);
DocIdSetIterator iterator = set.iterator();
for (int i = randomInt(maxDoc - 1); i < maxDoc; i += randomIntBetween(1, 10)) {
if (iterator.docID() < i) {
iterator.advance(i);
}
assertEquals(iterator.docID() == i, bits.get(i));
}
}
}

ReverseNestedTests.java

@@ -18,6 +18,7 @@
*/
package org.elasticsearch.search.aggregations.bucket;
+import org.apache.lucene.util.LuceneTestCase.AwaitsFix;
import org.elasticsearch.action.search.SearchPhaseExecutionException;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.common.settings.ImmutableSettings;
@@ -482,6 +483,7 @@ public class ReverseNestedTests extends ElasticsearchIntegrationTest {
}
@Test
+@AwaitsFix(bugUrl="http://github.com/elasticsearch/elasticsearch/issues/9547")
public void testSameParentDocHavingMultipleBuckets() throws Exception {
XContentBuilder mapping = jsonBuilder().startObject().startObject("product").field("dynamic", "strict").startObject("properties")
.startObject("id").field("type", "long").endObject()