From 8540a863aa8225b9e5c7c4a94c8d0523901502e8 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Tue, 3 Feb 2015 14:55:41 +0100 Subject: [PATCH] Search: Avoid calling DocIdSets.toSafeBits. This method is heavy as it builds a bitset out of a DocIdSet in order to be able to provide random-access. Now that Lucene has removed out-of-order scoring true random-access is very rarely needed and we could instead return an Bits instance that wraps the iterator. Ideally, we would use the DISI API directly but I have to admit that the Bits API is more friendly. Close #9546 --- .../common/lucene/docset/DocIdSets.java | 64 +++++++++++++++++-- .../lucene/index/FilterableTermsEnum.java | 2 +- .../function/FiltersFunctionScoreQuery.java | 4 +- .../children/ParentToChildrenAggregator.java | 2 +- .../bucket/filter/FilterAggregator.java | 2 +- .../bucket/filters/FiltersAggregator.java | 2 +- .../common/lucene/docset/DocIdSetsTests.java | 46 +++++++++++++ .../bucket/ReverseNestedTests.java | 2 + 8 files changed, 113 insertions(+), 11 deletions(-) diff --git a/src/main/java/org/elasticsearch/common/lucene/docset/DocIdSets.java b/src/main/java/org/elasticsearch/common/lucene/docset/DocIdSets.java index edafdb7b757..bffa699bc31 100644 --- a/src/main/java/org/elasticsearch/common/lucene/docset/DocIdSets.java +++ b/src/main/java/org/elasticsearch/common/lucene/docset/DocIdSets.java @@ -30,6 +30,8 @@ import org.apache.lucene.util.Bits; import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.RoaringDocIdSet; import org.apache.lucene.util.SparseFixedBitSet; +import org.elasticsearch.ElasticsearchIllegalArgumentException; +import org.elasticsearch.ElasticsearchIllegalStateException; import org.elasticsearch.common.Nullable; import org.elasticsearch.common.lucene.search.XDocIdSetIterator; @@ -109,11 +111,16 @@ public class DocIdSets { } /** - * Gets a set to bits. + * Get a build a {@link Bits} instance that will match all documents + * contained in {@code set}. Note that this is a potentially heavy + * operation as this might require to consume an iterator of this set + * entirely and to load it into a {@link BitSet}. Prefer using + * {@link #asSequentialAccessBits} if you only need to consume the + * {@link Bits} once and in order. */ - public static Bits toSafeBits(LeafReader reader, @Nullable DocIdSet set) throws IOException { + public static Bits toSafeBits(int maxDoc, @Nullable DocIdSet set) throws IOException { if (set == null) { - return new Bits.MatchNoBits(reader.maxDoc()); + return new Bits.MatchNoBits(maxDoc); } Bits bits = set.bits(); if (bits != null) { @@ -121,9 +128,56 @@ public class DocIdSets { } DocIdSetIterator iterator = set.iterator(); if (iterator == null) { - return new Bits.MatchNoBits(reader.maxDoc()); + return new Bits.MatchNoBits(maxDoc); } - return toBitSet(iterator, reader.maxDoc()); + return toBitSet(iterator, maxDoc); + } + + /** + * Given a {@link DocIdSet}, return a {@link Bits} instance that will match + * all documents contained in the set. Note that the returned {@link Bits} + * instance should only be consumed once and in order. + */ + public static Bits asSequentialAccessBits(final int maxDoc, @Nullable DocIdSet set) throws IOException { + if (set == null) { + return new Bits.MatchNoBits(maxDoc); + } + Bits bits = set.bits(); + if (bits != null) { + return bits; + } + final DocIdSetIterator iterator = set.iterator(); + if (iterator == null) { + return new Bits.MatchNoBits(maxDoc); + } + return new Bits() { + + int previous = 0; + + @Override + public boolean get(int index) { + if (index < previous) { + throw new ElasticsearchIllegalArgumentException("This Bits instance can only be consumed in order. " + + "Got called on [" + index + "] while previously called on [" + previous + "]"); + } + previous = index; + + int doc = iterator.docID(); + if (doc < index) { + try { + doc = iterator.advance(index); + } catch (IOException e) { + throw new ElasticsearchIllegalStateException("Cannot advance iterator", e); + } + } + return index == doc; + } + + @Override + public int length() { + return maxDoc; + } + }; } /** diff --git a/src/main/java/org/elasticsearch/common/lucene/index/FilterableTermsEnum.java b/src/main/java/org/elasticsearch/common/lucene/index/FilterableTermsEnum.java index f8e84d9a6de..3207e92182d 100644 --- a/src/main/java/org/elasticsearch/common/lucene/index/FilterableTermsEnum.java +++ b/src/main/java/org/elasticsearch/common/lucene/index/FilterableTermsEnum.java @@ -96,7 +96,7 @@ public class FilterableTermsEnum extends TermsEnum { // fully filtered, none matching, no need to iterate on this continue; } - bits = DocIdSets.toSafeBits(context.reader(), docIdSet); + bits = DocIdSets.toSafeBits(context.reader().maxDoc(), docIdSet); // Count how many docs are in our filtered set // TODO make this lazy-loaded only for those that need it? DocIdSetIterator iterator = docIdSet.iterator(); diff --git a/src/main/java/org/elasticsearch/common/lucene/search/function/FiltersFunctionScoreQuery.java b/src/main/java/org/elasticsearch/common/lucene/search/function/FiltersFunctionScoreQuery.java index bf26ea2cee1..1041043a194 100644 --- a/src/main/java/org/elasticsearch/common/lucene/search/function/FiltersFunctionScoreQuery.java +++ b/src/main/java/org/elasticsearch/common/lucene/search/function/FiltersFunctionScoreQuery.java @@ -163,7 +163,7 @@ public class FiltersFunctionScoreQuery extends Query { for (int i = 0; i < filterFunctions.length; i++) { FilterFunction filterFunction = filterFunctions[i]; filterFunction.function.setNextReader(context); - docSets[i] = DocIdSets.toSafeBits(context.reader(), filterFunction.filter.getDocIdSet(context, acceptDocs)); + docSets[i] = DocIdSets.asSequentialAccessBits(context.reader().maxDoc(), filterFunction.filter.getDocIdSet(context, acceptDocs)); } return new FiltersFunctionFactorScorer(this, subQueryScorer, scoreMode, filterFunctions, maxBoost, docSets, combineFunction, minScore); } @@ -186,7 +186,7 @@ public class FiltersFunctionScoreQuery extends Query { weightSum++; } - Bits docSet = DocIdSets.toSafeBits(context.reader(), + Bits docSet = DocIdSets.asSequentialAccessBits(context.reader().maxDoc(), filterFunction.filter.getDocIdSet(context, context.reader().getLiveDocs())); if (docSet.get(doc)) { filterFunction.function.setNextReader(context); diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/children/ParentToChildrenAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/children/ParentToChildrenAggregator.java index f08d20cb309..39ee460cbb0 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/children/ParentToChildrenAggregator.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/children/ParentToChildrenAggregator.java @@ -129,7 +129,7 @@ public class ParentToChildrenAggregator extends SingleBucketAggregator implement // The DocIdSets.toSafeBits(...) can convert to FixedBitSet, but this // will only happen if the none filter cache is used. (which only happens in tests) // Otherwise the filter cache will produce a bitset based filter. - parentDocs = DocIdSets.toSafeBits(reader.reader(), parentDocIdSet); + parentDocs = DocIdSets.asSequentialAccessBits(reader.reader().maxDoc(), parentDocIdSet); DocIdSet childDocIdSet = childFilter.getDocIdSet(reader, null); if (globalOrdinals != null && !DocIdSets.isEmpty(childDocIdSet)) { replay.add(reader); diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/filter/FilterAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/filter/FilterAggregator.java index bae4036d832..25052e2a4e6 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/filter/FilterAggregator.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/filter/FilterAggregator.java @@ -51,7 +51,7 @@ public class FilterAggregator extends SingleBucketAggregator { @Override public void setNextReader(LeafReaderContext reader) { try { - bits = DocIdSets.toSafeBits(reader.reader(), filter.getDocIdSet(reader, null)); + bits = DocIdSets.asSequentialAccessBits(reader.reader().maxDoc(), filter.getDocIdSet(reader, null)); } catch (IOException ioe) { throw new AggregationExecutionException("Failed to aggregate filter aggregator [" + name + "]", ioe); } diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/filters/FiltersAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/filters/FiltersAggregator.java index da925c9511d..3c891f9a5ba 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/filters/FiltersAggregator.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/filters/FiltersAggregator.java @@ -69,7 +69,7 @@ public class FiltersAggregator extends BucketsAggregator { public void setNextReader(LeafReaderContext reader) { try { for (int i = 0; i < filters.length; i++) { - bits[i] = DocIdSets.toSafeBits(reader.reader(), filters[i].filter.getDocIdSet(reader, null)); + bits[i] = DocIdSets.asSequentialAccessBits(reader.reader().maxDoc(), filters[i].filter.getDocIdSet(reader, null)); } } catch (IOException ioe) { throw new AggregationExecutionException("Failed to aggregate filter aggregator [" + name + "]", ioe); diff --git a/src/test/java/org/elasticsearch/common/lucene/docset/DocIdSetsTests.java b/src/test/java/org/elasticsearch/common/lucene/docset/DocIdSetsTests.java index 5eb2f28da7d..e57bd0c9e8f 100644 --- a/src/test/java/org/elasticsearch/common/lucene/docset/DocIdSetsTests.java +++ b/src/test/java/org/elasticsearch/common/lucene/docset/DocIdSetsTests.java @@ -23,7 +23,11 @@ import java.io.IOException; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.Filter; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.RoaringDocIdSet; +import org.elasticsearch.ElasticsearchIllegalArgumentException; import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.common.settings.Settings; @@ -119,4 +123,46 @@ public class DocIdSetsTests extends ElasticsearchSingleNodeTest { test(indexService, true, filter); } + public void testAsSequentialAccessBits() throws IOException { + final int maxDoc = randomIntBetween(5, 100); + + // Null DocIdSet maps to empty bits + Bits bits = DocIdSets.asSequentialAccessBits(100, null); + for (int i = 0; i < maxDoc; ++i) { + assertFalse(bits.get(i)); + } + + // Empty set maps to empty bits + bits = DocIdSets.asSequentialAccessBits(100, DocIdSet.EMPTY); + for (int i = 0; i < maxDoc; ++i) { + assertFalse(bits.get(i)); + } + + RoaringDocIdSet.Builder b = new RoaringDocIdSet.Builder(maxDoc); + for (int i = randomInt(maxDoc - 1); i < maxDoc; i += randomIntBetween(1, 10)) { + b.add(i); + } + final RoaringDocIdSet set = b.build(); + // RoaringDocIdSet does not support random access + assertNull(set.bits()); + + bits = DocIdSets.asSequentialAccessBits(100, set); + bits.get(4); + try { + bits.get(2); + fail("Should have thrown an exception because of out-of-order consumption"); + } catch (ElasticsearchIllegalArgumentException e) { + // ok + } + + bits = DocIdSets.asSequentialAccessBits(100, set); + DocIdSetIterator iterator = set.iterator(); + for (int i = randomInt(maxDoc - 1); i < maxDoc; i += randomIntBetween(1, 10)) { + if (iterator.docID() < i) { + iterator.advance(i); + } + + assertEquals(iterator.docID() == i, bits.get(i)); + } + } } diff --git a/src/test/java/org/elasticsearch/search/aggregations/bucket/ReverseNestedTests.java b/src/test/java/org/elasticsearch/search/aggregations/bucket/ReverseNestedTests.java index 2b782d869f1..91c345d4dc8 100644 --- a/src/test/java/org/elasticsearch/search/aggregations/bucket/ReverseNestedTests.java +++ b/src/test/java/org/elasticsearch/search/aggregations/bucket/ReverseNestedTests.java @@ -18,6 +18,7 @@ */ package org.elasticsearch.search.aggregations.bucket; +import org.apache.lucene.util.LuceneTestCase.AwaitsFix; import org.elasticsearch.action.search.SearchPhaseExecutionException; import org.elasticsearch.action.search.SearchResponse; import org.elasticsearch.common.settings.ImmutableSettings; @@ -482,6 +483,7 @@ public class ReverseNestedTests extends ElasticsearchIntegrationTest { } @Test + @AwaitsFix(bugUrl="http://github.com/elasticsearch/elasticsearch/issues/9547") public void testSameParentDocHavingMultipleBuckets() throws Exception { XContentBuilder mapping = jsonBuilder().startObject().startObject("product").field("dynamic", "strict").startObject("properties") .startObject("id").field("type", "long").endObject()