From 144813629a98f9aaa0e765666290925df2f44fe6 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Fri, 7 Nov 2014 09:17:50 +0100 Subject: [PATCH] Internal: Inverse DocIdSets' heuristic to find out fast DocIdSets. DocIdSets.isFast(DocIdSet) has two issues: - it works on the DocIdSet interface while some doc sets can generate either slow or fast sets depending on their options (eg. whether an OrDocIdSet is fast or not depends on the wrapped clauses). - it only works because the result of this method is only taken into account when a DocIdSet has non-null `bits()`. This commit changes this method to work on top of a DocIdSetIterator and to use a black-list rather than a white list: slow iterators should really be the exception rather than the rule. Close #8380 --- .../common/lucene/docset/AndDocIdSet.java | 64 ++++----- .../common/lucene/docset/DocIdSets.java | 31 +++-- .../common/lucene/docset/OrDocIdSet.java | 13 +- .../common/lucene/search/XBooleanFilter.java | 18 +-- .../lucene/search/XDocIdSetIterator.java | 40 ++++++ .../index/query/FilteredQueryParser.java | 5 +- .../common/lucene/docset/DocIdSetsTests.java | 122 ++++++++++++++++++ 7 files changed, 242 insertions(+), 51 deletions(-) create mode 100644 src/main/java/org/elasticsearch/common/lucene/search/XDocIdSetIterator.java create mode 100644 src/test/java/org/elasticsearch/common/lucene/docset/DocIdSetsTests.java diff --git a/src/main/java/org/elasticsearch/common/lucene/docset/AndDocIdSet.java b/src/main/java/org/elasticsearch/common/lucene/docset/AndDocIdSet.java index d988fe17997..c9d9f8b513a 100644 --- a/src/main/java/org/elasticsearch/common/lucene/docset/AndDocIdSet.java +++ b/src/main/java/org/elasticsearch/common/lucene/docset/AndDocIdSet.java @@ -19,17 +19,21 @@ package org.elasticsearch.common.lucene.docset; +import com.google.common.collect.Iterables; + import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Bits; import org.apache.lucene.util.InPlaceMergeSorter; import org.apache.lucene.util.RamUsageEstimator; +import org.elasticsearch.common.lucene.search.XDocIdSetIterator; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; +import java.util.Collections; import java.util.List; /** @@ -78,18 +82,21 @@ public class AndDocIdSet extends DocIdSet { public DocIdSetIterator iterator() throws IOException { // we try and be smart here, if we can iterate through docsets quickly, prefer to iterate // over them as much as possible, before actually going to "bits" based ones to check - List iterators = new ArrayList<>(sets.length); + List iterators = new ArrayList<>(sets.length); List bits = new ArrayList<>(sets.length); for (DocIdSet set : sets) { - if (DocIdSets.isFastIterator(set)) { - iterators.add(set); + if (DocIdSets.isEmpty(set)) { + return DocIdSetIterator.empty(); + } + DocIdSetIterator it = set.iterator(); + if (it == null) { + return DocIdSetIterator.empty(); + } + Bits bit = set.bits(); + if (bit != null && DocIdSets.isBroken(it)) { + bits.add(bit); } else { - Bits bit = set.bits(); - if (bit != null) { - bits.add(bit); - } else { - iterators.add(set); - } + iterators.add(it); } } if (bits.isEmpty()) { @@ -131,38 +138,25 @@ public class AndDocIdSet extends DocIdSet { } } - static class IteratorBasedIterator extends DocIdSetIterator { + static class IteratorBasedIterator extends XDocIdSetIterator { private int doc = -1; private final DocIdSetIterator lead; private final DocIdSetIterator[] otherIterators; - public static DocIdSetIterator newDocIdSetIterator(Collection sets) throws IOException { - if (sets.isEmpty()) { - return DocIdSetIterator.empty(); + public static DocIdSetIterator newDocIdSetIterator(Collection iterators) throws IOException { + if (iterators.isEmpty()) { + return DocIdSetIterator.empty(); } - final DocIdSetIterator[] iterators = new DocIdSetIterator[sets.size()]; - int j = 0; - for (DocIdSet set : sets) { - if (set == null) { - return DocIdSetIterator.empty(); - } else { - DocIdSetIterator docIdSetIterator = set.iterator(); - if (docIdSetIterator == null) { - return DocIdSetIterator.empty();// non matching - } - iterators[j++] = docIdSetIterator; - } - } - if (sets.size() == 1) { + if (iterators.size() == 1) { // shortcut if there is only one valid iterator. - return iterators[0]; + return iterators.iterator().next(); } return new IteratorBasedIterator(iterators); } - private IteratorBasedIterator(DocIdSetIterator[] iterators) throws IOException { - final DocIdSetIterator[] sortedIterators = Arrays.copyOf(iterators, iterators.length); + private IteratorBasedIterator(Collection iterators) throws IOException { + final DocIdSetIterator[] sortedIterators = iterators.toArray(new DocIdSetIterator[iterators.size()]); new InPlaceMergeSorter() { @Override @@ -180,6 +174,16 @@ public class AndDocIdSet extends DocIdSet { this.otherIterators = Arrays.copyOfRange(sortedIterators, 1, sortedIterators.length); } + @Override + public boolean isBroken() { + for (DocIdSetIterator it : Iterables.concat(Collections.singleton(lead), Arrays.asList(otherIterators))) { + if (DocIdSets.isBroken(it)) { + return true; + } + } + return false; + } + @Override public final int docID() { return doc; diff --git a/src/main/java/org/elasticsearch/common/lucene/docset/DocIdSets.java b/src/main/java/org/elasticsearch/common/lucene/docset/DocIdSets.java index 4102c645206..edafdb7b757 100644 --- a/src/main/java/org/elasticsearch/common/lucene/docset/DocIdSets.java +++ b/src/main/java/org/elasticsearch/common/lucene/docset/DocIdSets.java @@ -20,9 +20,10 @@ package org.elasticsearch.common.lucene.docset; import org.apache.lucene.index.LeafReader; -import org.apache.lucene.search.BitsFilteredDocIdSet; import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.DocValuesDocIdSet; +import org.apache.lucene.search.FilteredDocIdSetIterator; import org.apache.lucene.util.BitDocIdSet; import org.apache.lucene.util.BitSet; import org.apache.lucene.util.Bits; @@ -30,6 +31,7 @@ import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.RoaringDocIdSet; import org.apache.lucene.util.SparseFixedBitSet; import org.elasticsearch.common.Nullable; +import org.elasticsearch.common.lucene.search.XDocIdSetIterator; import java.io.IOException; @@ -52,15 +54,28 @@ public class DocIdSets { } /** - * Is {@link org.apache.lucene.search.DocIdSetIterator} implemented in a "fast" manner. - * For example, it does not ends up iterating one doc at a time check for its "value". + * Check if the given iterator can nextDoc() or advance() in sub-linear time + * of the number of documents. For instance, an iterator that would need to + * iterate one document at a time to check for its value would be considered + * broken. */ - public static boolean isFastIterator(DocIdSet set) { - // TODO: this is really horrible - while (set instanceof BitsFilteredDocIdSet) { - set = ((BitsFilteredDocIdSet) set).getDelegate(); + public static boolean isBroken(DocIdSetIterator iterator) { + while (iterator instanceof FilteredDocIdSetIterator) { + // this iterator is filtered (likely by some bits) + // unwrap in order to check if the underlying iterator is fast + iterator = ((FilteredDocIdSetIterator) iterator).getDelegate(); } - return set instanceof BitDocIdSet || set instanceof RoaringDocIdSet; + if (iterator instanceof XDocIdSetIterator) { + return ((XDocIdSetIterator) iterator).isBroken(); + } + if (iterator instanceof MatchDocIdSetIterator) { + return true; + } + // DocValuesDocIdSet produces anonymous slow iterators + if (iterator != null && DocValuesDocIdSet.class.equals(iterator.getClass().getEnclosingClass())) { + return true; + } + return false; } /** diff --git a/src/main/java/org/elasticsearch/common/lucene/docset/OrDocIdSet.java b/src/main/java/org/elasticsearch/common/lucene/docset/OrDocIdSet.java index dfb6157fb36..8324e444267 100644 --- a/src/main/java/org/elasticsearch/common/lucene/docset/OrDocIdSet.java +++ b/src/main/java/org/elasticsearch/common/lucene/docset/OrDocIdSet.java @@ -23,9 +23,9 @@ import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.Bits; import org.apache.lucene.util.RamUsageEstimator; +import org.elasticsearch.common.lucene.search.XDocIdSetIterator; import java.io.IOException; -import java.util.Collection; /** * @@ -99,7 +99,7 @@ public class OrDocIdSet extends DocIdSet { } } - static class IteratorBasedIterator extends DocIdSetIterator { + static class IteratorBasedIterator extends XDocIdSetIterator { final class Item { public final DocIdSetIterator iter; @@ -115,23 +115,32 @@ public class OrDocIdSet extends DocIdSet { private final Item[] _heap; private int _size; private final long cost; + private final boolean broken; IteratorBasedIterator(DocIdSet[] sets) throws IOException { _curDoc = -1; _heap = new Item[sets.length]; _size = 0; long cost = 0; + boolean broken = false; for (DocIdSet set : sets) { DocIdSetIterator iterator = set.iterator(); + broken |= DocIdSets.isBroken(iterator); if (iterator != null) { _heap[_size++] = new Item(iterator); cost += iterator.cost(); } } this.cost = cost; + this.broken = broken; if (_size == 0) _curDoc = DocIdSetIterator.NO_MORE_DOCS; } + @Override + public boolean isBroken() { + return broken; + } + @Override public final int docID() { return _curDoc; diff --git a/src/main/java/org/elasticsearch/common/lucene/search/XBooleanFilter.java b/src/main/java/org/elasticsearch/common/lucene/search/XBooleanFilter.java index 46740fcf898..bca81e85c53 100644 --- a/src/main/java/org/elasticsearch/common/lucene/search/XBooleanFilter.java +++ b/src/main/java/org/elasticsearch/common/lucene/search/XBooleanFilter.java @@ -124,29 +124,29 @@ public class XBooleanFilter extends Filter implements Iterable { // continue, but we recorded that there is at least one should clause // so that if all iterators are null we know that nothing matches this // filter since at least one SHOULD clause needs to match - } else if (bits == null || DocIdSets.isFastIterator(set)) { - shouldIterators.add(it); - } else { + } else if (bits != null && DocIdSets.isBroken(it)) { shouldBits.add(bits); + } else { + shouldIterators.add(it); } break; case MUST: if (it == null) { // no documents matched a clause that is compulsory, then nothing matches at all return null; - } else if (bits == null || DocIdSets.isFastIterator(set)) { - requiredIterators.add(it); - } else { + } else if (bits != null && DocIdSets.isBroken(it)) { requiredBits.add(bits); + } else { + requiredIterators.add(it); } break; case MUST_NOT: if (it == null) { // ignore - } else if (bits == null || DocIdSets.isFastIterator(set)) { - excludedIterators.add(it); - } else { + } else if (bits != null && DocIdSets.isBroken(it)) { excludedBits.add(bits); + } else { + excludedIterators.add(it); } break; default: diff --git a/src/main/java/org/elasticsearch/common/lucene/search/XDocIdSetIterator.java b/src/main/java/org/elasticsearch/common/lucene/search/XDocIdSetIterator.java new file mode 100644 index 00000000000..6f89967280c --- /dev/null +++ b/src/main/java/org/elasticsearch/common/lucene/search/XDocIdSetIterator.java @@ -0,0 +1,40 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.common.lucene.search; + +import org.apache.lucene.search.DocIdSetIterator; +import org.elasticsearch.common.lucene.docset.DocIdSets; + +/** + * Extension of {@link DocIdSetIterator} that allows to know if iteration is + * implemented efficiently. + */ +public abstract class XDocIdSetIterator extends DocIdSetIterator { + + /** + * Return true if this iterator cannot both + * {@link DocIdSetIterator#nextDoc} and {@link DocIdSetIterator#advance} + * in sub-linear time. + * + * Do not call this method directly, use {@link DocIdSets#isBroken}. + */ + public abstract boolean isBroken(); + +} diff --git a/src/main/java/org/elasticsearch/index/query/FilteredQueryParser.java b/src/main/java/org/elasticsearch/index/query/FilteredQueryParser.java index 228fb91d466..ba454ab8b1f 100644 --- a/src/main/java/org/elasticsearch/index/query/FilteredQueryParser.java +++ b/src/main/java/org/elasticsearch/index/query/FilteredQueryParser.java @@ -76,8 +76,8 @@ public class FilteredQueryParser implements QueryParser { @Override public Scorer filteredScorer(LeafReaderContext context, Weight weight, DocIdSet docIdSet) throws IOException { // CHANGE: If threshold is 0, always pass down the accept docs, don't pay the price of calling nextDoc even... + final Bits filterAcceptDocs = docIdSet.bits(); if (threshold == 0) { - final Bits filterAcceptDocs = docIdSet.bits(); if (filterAcceptDocs != null) { return weight.scorer(context, filterAcceptDocs); } else { @@ -88,7 +88,8 @@ public class FilteredQueryParser implements QueryParser { // CHANGE: handle "default" value if (threshold == -1) { // default value, don't iterate on only apply filter after query if its not a "fast" docIdSet - if (!DocIdSets.isFastIterator(docIdSet)) { + // TODO: is there a way we could avoid creating an iterator here? + if (filterAcceptDocs != null && DocIdSets.isBroken(docIdSet.iterator())) { return FilteredQuery.QUERY_FIRST_FILTER_STRATEGY.filteredScorer(context, weight, docIdSet); } } diff --git a/src/test/java/org/elasticsearch/common/lucene/docset/DocIdSetsTests.java b/src/test/java/org/elasticsearch/common/lucene/docset/DocIdSetsTests.java new file mode 100644 index 00000000000..f3f2ae348cb --- /dev/null +++ b/src/test/java/org/elasticsearch/common/lucene/docset/DocIdSetsTests.java @@ -0,0 +1,122 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.common.lucene.docset; + +import java.io.IOException; + +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.Filter; +import org.elasticsearch.cluster.metadata.IndexMetaData; +import org.elasticsearch.common.settings.ImmutableSettings; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.unit.DistanceUnit; +import org.elasticsearch.common.xcontent.ToXContent; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.common.xcontent.json.JsonXContent; +import org.elasticsearch.index.engine.Engine.Searcher; +import org.elasticsearch.index.query.FilterBuilder; +import org.elasticsearch.index.query.FilterBuilders; +import org.elasticsearch.index.query.TermFilterBuilder; +import org.elasticsearch.index.service.IndexService; +import org.elasticsearch.test.ElasticsearchSingleNodeTest; + +public class DocIdSetsTests extends ElasticsearchSingleNodeTest { + + private static final Settings SINGLE_SHARD_SETTINGS = ImmutableSettings.builder().put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1).build(); + + private void test(IndexService indexService, boolean broken, FilterBuilder filterBuilder) throws IOException { + client().admin().indices().prepareRefresh("test").get(); + XContentBuilder builder = filterBuilder.toXContent(JsonXContent.contentBuilder(), ToXContent.EMPTY_PARAMS); + XContentParser parser = JsonXContent.jsonXContent.createParser(builder.bytes()); + Filter filter = indexService.queryParserService().parseInnerFilter(parser).filter(); + try (Searcher searcher = indexService.shardSafe(0).acquireSearcher("test")) { + final LeafReaderContext ctx = searcher.reader().leaves().get(0); + DocIdSet set = filter.getDocIdSet(ctx, null); + assertEquals(broken, DocIdSets.isBroken(set.iterator())); + } + } + + public void testTermIsNotBroken() throws IOException { + IndexService indexService = createIndex("test", SINGLE_SHARD_SETTINGS, "type", "l", "type=long"); + client().prepareIndex("test", "type").setSource("l", 7).get(); + TermFilterBuilder filter = FilterBuilders.termFilter("l", 7).cache(randomBoolean()); + test(indexService, false, filter); + } + + public void testDefaultGeoIsBroken() throws IOException { + // Geo is slow by default :'( + IndexService indexService = createIndex("test", SINGLE_SHARD_SETTINGS, "type", "gp", "type=geo_point"); + client().prepareIndex("test", "type").setSource("gp", "2,3").get(); + FilterBuilder filter = FilterBuilders.geoDistanceFilter("gp").distance(1000, DistanceUnit.KILOMETERS).point(3, 2); + test(indexService, true, filter); + + } + + public void testIndexedGeoIsNotBroken() throws IOException { + // Geo has a fast iterator when indexing lat,lon and using the "indexed" bbox optimization + IndexService indexService = createIndex("test", SINGLE_SHARD_SETTINGS, "type", "gp", "type=geo_point,lat_lon=true"); + client().prepareIndex("test", "type").setSource("gp", "2,3").get(); + FilterBuilder filter = FilterBuilders.geoDistanceFilter("gp").distance(1000, DistanceUnit.KILOMETERS).point(3, 2).optimizeBbox("indexed"); + test(indexService, false, filter); + } + + public void testScriptIsBroken() throws IOException { // by nature unfortunately + IndexService indexService = createIndex("test", SINGLE_SHARD_SETTINGS, "type", "l", "type=long"); + client().prepareIndex("test", "type").setSource("l", 7).get(); + FilterBuilder filter = FilterBuilders.scriptFilter("doc['l'].value < 8"); + test(indexService, true, filter); + } + + public void testCachedIsNotBroken() throws IOException { + IndexService indexService = createIndex("test", SINGLE_SHARD_SETTINGS, "type", "l", "type=long"); + client().prepareIndex("test", "type").setSource("l", 7).get(); + // This filter is inherently slow but by caching it we pay the price at caching time, not iteration + FilterBuilder filter = FilterBuilders.scriptFilter("doc['l'].value < 8").cache(true); + test(indexService, false, filter); + } + + public void testOr() throws IOException { + IndexService indexService = createIndex("test", SINGLE_SHARD_SETTINGS, "type", "l", "type=long"); + client().prepareIndex("test", "type").setSource("l", new long[] {7, 8}).get(); + // Or with fast clauses is fast + FilterBuilder filter = FilterBuilders.orFilter(FilterBuilders.termFilter("l", 7), FilterBuilders.termFilter("l", 8)); + test(indexService, false, filter); + // But if at least one clause is broken, it is broken + filter = FilterBuilders.orFilter(FilterBuilders.termFilter("l", 7), FilterBuilders.scriptFilter("doc['l'].value < 8")); + test(indexService, true, filter); + } + + public void testAnd() throws IOException { + IndexService indexService = createIndex("test", SINGLE_SHARD_SETTINGS, "type", "l", "type=long"); + client().prepareIndex("test", "type").setSource("l", new long[] {7, 8}).get(); + // And with fast clauses is fast + FilterBuilder filter = FilterBuilders.andFilter(FilterBuilders.termFilter("l", 7), FilterBuilders.termFilter("l", 8)); + test(indexService, false, filter); + // If at least one clause is 'fast' and the other clauses supports random-access, it is still fast + filter = FilterBuilders.andFilter(FilterBuilders.termFilter("l", 7).cache(randomBoolean()), FilterBuilders.scriptFilter("doc['l'].value < 8")); + test(indexService, false, filter); + // However if all clauses are broken, the and is broken + filter = FilterBuilders.andFilter(FilterBuilders.scriptFilter("doc['l'].value > 5"), FilterBuilders.scriptFilter("doc['l'].value < 8")); + test(indexService, true, filter); + } + +}