diff --git a/src/main/java/org/elasticsearch/common/lucene/docset/AndDocIdSet.java b/src/main/java/org/elasticsearch/common/lucene/docset/AndDocIdSet.java index d988fe17997..c9d9f8b513a 100644 --- a/src/main/java/org/elasticsearch/common/lucene/docset/AndDocIdSet.java +++ b/src/main/java/org/elasticsearch/common/lucene/docset/AndDocIdSet.java @@ -19,17 +19,21 @@ package org.elasticsearch.common.lucene.docset; +import com.google.common.collect.Iterables; + import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Bits; import org.apache.lucene.util.InPlaceMergeSorter; import org.apache.lucene.util.RamUsageEstimator; +import org.elasticsearch.common.lucene.search.XDocIdSetIterator; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; +import java.util.Collections; import java.util.List; /** @@ -78,18 +82,21 @@ public class AndDocIdSet extends DocIdSet { public DocIdSetIterator iterator() throws IOException { // we try and be smart here, if we can iterate through docsets quickly, prefer to iterate // over them as much as possible, before actually going to "bits" based ones to check - List iterators = new ArrayList<>(sets.length); + List iterators = new ArrayList<>(sets.length); List bits = new ArrayList<>(sets.length); for (DocIdSet set : sets) { - if (DocIdSets.isFastIterator(set)) { - iterators.add(set); + if (DocIdSets.isEmpty(set)) { + return DocIdSetIterator.empty(); + } + DocIdSetIterator it = set.iterator(); + if (it == null) { + return DocIdSetIterator.empty(); + } + Bits bit = set.bits(); + if (bit != null && DocIdSets.isBroken(it)) { + bits.add(bit); } else { - Bits bit = set.bits(); - if (bit != null) { - bits.add(bit); - } else { - iterators.add(set); - } + iterators.add(it); } } if (bits.isEmpty()) { @@ -131,38 +138,25 @@ public class AndDocIdSet extends DocIdSet { } } - static class IteratorBasedIterator extends DocIdSetIterator { + static class IteratorBasedIterator extends XDocIdSetIterator { private int doc = -1; private final DocIdSetIterator lead; private final DocIdSetIterator[] otherIterators; - public static DocIdSetIterator newDocIdSetIterator(Collection sets) throws IOException { - if (sets.isEmpty()) { - return DocIdSetIterator.empty(); + public static DocIdSetIterator newDocIdSetIterator(Collection iterators) throws IOException { + if (iterators.isEmpty()) { + return DocIdSetIterator.empty(); } - final DocIdSetIterator[] iterators = new DocIdSetIterator[sets.size()]; - int j = 0; - for (DocIdSet set : sets) { - if (set == null) { - return DocIdSetIterator.empty(); - } else { - DocIdSetIterator docIdSetIterator = set.iterator(); - if (docIdSetIterator == null) { - return DocIdSetIterator.empty();// non matching - } - iterators[j++] = docIdSetIterator; - } - } - if (sets.size() == 1) { + if (iterators.size() == 1) { // shortcut if there is only one valid iterator. - return iterators[0]; + return iterators.iterator().next(); } return new IteratorBasedIterator(iterators); } - private IteratorBasedIterator(DocIdSetIterator[] iterators) throws IOException { - final DocIdSetIterator[] sortedIterators = Arrays.copyOf(iterators, iterators.length); + private IteratorBasedIterator(Collection iterators) throws IOException { + final DocIdSetIterator[] sortedIterators = iterators.toArray(new DocIdSetIterator[iterators.size()]); new InPlaceMergeSorter() { @Override @@ -180,6 +174,16 @@ public class AndDocIdSet extends DocIdSet { this.otherIterators = Arrays.copyOfRange(sortedIterators, 1, sortedIterators.length); } + @Override + public boolean isBroken() { + for (DocIdSetIterator it : Iterables.concat(Collections.singleton(lead), Arrays.asList(otherIterators))) { + if (DocIdSets.isBroken(it)) { + return true; + } + } + return false; + } + @Override public final int docID() { return doc; diff --git a/src/main/java/org/elasticsearch/common/lucene/docset/DocIdSets.java b/src/main/java/org/elasticsearch/common/lucene/docset/DocIdSets.java index 4102c645206..edafdb7b757 100644 --- a/src/main/java/org/elasticsearch/common/lucene/docset/DocIdSets.java +++ b/src/main/java/org/elasticsearch/common/lucene/docset/DocIdSets.java @@ -20,9 +20,10 @@ package org.elasticsearch.common.lucene.docset; import org.apache.lucene.index.LeafReader; -import org.apache.lucene.search.BitsFilteredDocIdSet; import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.DocValuesDocIdSet; +import org.apache.lucene.search.FilteredDocIdSetIterator; import org.apache.lucene.util.BitDocIdSet; import org.apache.lucene.util.BitSet; import org.apache.lucene.util.Bits; @@ -30,6 +31,7 @@ import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.RoaringDocIdSet; import org.apache.lucene.util.SparseFixedBitSet; import org.elasticsearch.common.Nullable; +import org.elasticsearch.common.lucene.search.XDocIdSetIterator; import java.io.IOException; @@ -52,15 +54,28 @@ public class DocIdSets { } /** - * Is {@link org.apache.lucene.search.DocIdSetIterator} implemented in a "fast" manner. - * For example, it does not ends up iterating one doc at a time check for its "value". + * Check if the given iterator can nextDoc() or advance() in sub-linear time + * of the number of documents. For instance, an iterator that would need to + * iterate one document at a time to check for its value would be considered + * broken. */ - public static boolean isFastIterator(DocIdSet set) { - // TODO: this is really horrible - while (set instanceof BitsFilteredDocIdSet) { - set = ((BitsFilteredDocIdSet) set).getDelegate(); + public static boolean isBroken(DocIdSetIterator iterator) { + while (iterator instanceof FilteredDocIdSetIterator) { + // this iterator is filtered (likely by some bits) + // unwrap in order to check if the underlying iterator is fast + iterator = ((FilteredDocIdSetIterator) iterator).getDelegate(); } - return set instanceof BitDocIdSet || set instanceof RoaringDocIdSet; + if (iterator instanceof XDocIdSetIterator) { + return ((XDocIdSetIterator) iterator).isBroken(); + } + if (iterator instanceof MatchDocIdSetIterator) { + return true; + } + // DocValuesDocIdSet produces anonymous slow iterators + if (iterator != null && DocValuesDocIdSet.class.equals(iterator.getClass().getEnclosingClass())) { + return true; + } + return false; } /** diff --git a/src/main/java/org/elasticsearch/common/lucene/docset/OrDocIdSet.java b/src/main/java/org/elasticsearch/common/lucene/docset/OrDocIdSet.java index dfb6157fb36..8324e444267 100644 --- a/src/main/java/org/elasticsearch/common/lucene/docset/OrDocIdSet.java +++ b/src/main/java/org/elasticsearch/common/lucene/docset/OrDocIdSet.java @@ -23,9 +23,9 @@ import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.Bits; import org.apache.lucene.util.RamUsageEstimator; +import org.elasticsearch.common.lucene.search.XDocIdSetIterator; import java.io.IOException; -import java.util.Collection; /** * @@ -99,7 +99,7 @@ public class OrDocIdSet extends DocIdSet { } } - static class IteratorBasedIterator extends DocIdSetIterator { + static class IteratorBasedIterator extends XDocIdSetIterator { final class Item { public final DocIdSetIterator iter; @@ -115,23 +115,32 @@ public class OrDocIdSet extends DocIdSet { private final Item[] _heap; private int _size; private final long cost; + private final boolean broken; IteratorBasedIterator(DocIdSet[] sets) throws IOException { _curDoc = -1; _heap = new Item[sets.length]; _size = 0; long cost = 0; + boolean broken = false; for (DocIdSet set : sets) { DocIdSetIterator iterator = set.iterator(); + broken |= DocIdSets.isBroken(iterator); if (iterator != null) { _heap[_size++] = new Item(iterator); cost += iterator.cost(); } } this.cost = cost; + this.broken = broken; if (_size == 0) _curDoc = DocIdSetIterator.NO_MORE_DOCS; } + @Override + public boolean isBroken() { + return broken; + } + @Override public final int docID() { return _curDoc; diff --git a/src/main/java/org/elasticsearch/common/lucene/search/XBooleanFilter.java b/src/main/java/org/elasticsearch/common/lucene/search/XBooleanFilter.java index 46740fcf898..bca81e85c53 100644 --- a/src/main/java/org/elasticsearch/common/lucene/search/XBooleanFilter.java +++ b/src/main/java/org/elasticsearch/common/lucene/search/XBooleanFilter.java @@ -124,29 +124,29 @@ public class XBooleanFilter extends Filter implements Iterable { // continue, but we recorded that there is at least one should clause // so that if all iterators are null we know that nothing matches this // filter since at least one SHOULD clause needs to match - } else if (bits == null || DocIdSets.isFastIterator(set)) { - shouldIterators.add(it); - } else { + } else if (bits != null && DocIdSets.isBroken(it)) { shouldBits.add(bits); + } else { + shouldIterators.add(it); } break; case MUST: if (it == null) { // no documents matched a clause that is compulsory, then nothing matches at all return null; - } else if (bits == null || DocIdSets.isFastIterator(set)) { - requiredIterators.add(it); - } else { + } else if (bits != null && DocIdSets.isBroken(it)) { requiredBits.add(bits); + } else { + requiredIterators.add(it); } break; case MUST_NOT: if (it == null) { // ignore - } else if (bits == null || DocIdSets.isFastIterator(set)) { - excludedIterators.add(it); - } else { + } else if (bits != null && DocIdSets.isBroken(it)) { excludedBits.add(bits); + } else { + excludedIterators.add(it); } break; default: diff --git a/src/main/java/org/elasticsearch/common/lucene/search/XDocIdSetIterator.java b/src/main/java/org/elasticsearch/common/lucene/search/XDocIdSetIterator.java new file mode 100644 index 00000000000..6f89967280c --- /dev/null +++ b/src/main/java/org/elasticsearch/common/lucene/search/XDocIdSetIterator.java @@ -0,0 +1,40 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.common.lucene.search; + +import org.apache.lucene.search.DocIdSetIterator; +import org.elasticsearch.common.lucene.docset.DocIdSets; + +/** + * Extension of {@link DocIdSetIterator} that allows to know if iteration is + * implemented efficiently. + */ +public abstract class XDocIdSetIterator extends DocIdSetIterator { + + /** + * Return true if this iterator cannot both + * {@link DocIdSetIterator#nextDoc} and {@link DocIdSetIterator#advance} + * in sub-linear time. + * + * Do not call this method directly, use {@link DocIdSets#isBroken}. + */ + public abstract boolean isBroken(); + +} diff --git a/src/main/java/org/elasticsearch/index/query/FilteredQueryParser.java b/src/main/java/org/elasticsearch/index/query/FilteredQueryParser.java index 228fb91d466..ba454ab8b1f 100644 --- a/src/main/java/org/elasticsearch/index/query/FilteredQueryParser.java +++ b/src/main/java/org/elasticsearch/index/query/FilteredQueryParser.java @@ -76,8 +76,8 @@ public class FilteredQueryParser implements QueryParser { @Override public Scorer filteredScorer(LeafReaderContext context, Weight weight, DocIdSet docIdSet) throws IOException { // CHANGE: If threshold is 0, always pass down the accept docs, don't pay the price of calling nextDoc even... + final Bits filterAcceptDocs = docIdSet.bits(); if (threshold == 0) { - final Bits filterAcceptDocs = docIdSet.bits(); if (filterAcceptDocs != null) { return weight.scorer(context, filterAcceptDocs); } else { @@ -88,7 +88,8 @@ public class FilteredQueryParser implements QueryParser { // CHANGE: handle "default" value if (threshold == -1) { // default value, don't iterate on only apply filter after query if its not a "fast" docIdSet - if (!DocIdSets.isFastIterator(docIdSet)) { + // TODO: is there a way we could avoid creating an iterator here? + if (filterAcceptDocs != null && DocIdSets.isBroken(docIdSet.iterator())) { return FilteredQuery.QUERY_FIRST_FILTER_STRATEGY.filteredScorer(context, weight, docIdSet); } } diff --git a/src/test/java/org/elasticsearch/common/lucene/docset/DocIdSetsTests.java b/src/test/java/org/elasticsearch/common/lucene/docset/DocIdSetsTests.java new file mode 100644 index 00000000000..f3f2ae348cb --- /dev/null +++ b/src/test/java/org/elasticsearch/common/lucene/docset/DocIdSetsTests.java @@ -0,0 +1,122 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.common.lucene.docset; + +import java.io.IOException; + +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.Filter; +import org.elasticsearch.cluster.metadata.IndexMetaData; +import org.elasticsearch.common.settings.ImmutableSettings; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.unit.DistanceUnit; +import org.elasticsearch.common.xcontent.ToXContent; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.common.xcontent.json.JsonXContent; +import org.elasticsearch.index.engine.Engine.Searcher; +import org.elasticsearch.index.query.FilterBuilder; +import org.elasticsearch.index.query.FilterBuilders; +import org.elasticsearch.index.query.TermFilterBuilder; +import org.elasticsearch.index.service.IndexService; +import org.elasticsearch.test.ElasticsearchSingleNodeTest; + +public class DocIdSetsTests extends ElasticsearchSingleNodeTest { + + private static final Settings SINGLE_SHARD_SETTINGS = ImmutableSettings.builder().put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1).build(); + + private void test(IndexService indexService, boolean broken, FilterBuilder filterBuilder) throws IOException { + client().admin().indices().prepareRefresh("test").get(); + XContentBuilder builder = filterBuilder.toXContent(JsonXContent.contentBuilder(), ToXContent.EMPTY_PARAMS); + XContentParser parser = JsonXContent.jsonXContent.createParser(builder.bytes()); + Filter filter = indexService.queryParserService().parseInnerFilter(parser).filter(); + try (Searcher searcher = indexService.shardSafe(0).acquireSearcher("test")) { + final LeafReaderContext ctx = searcher.reader().leaves().get(0); + DocIdSet set = filter.getDocIdSet(ctx, null); + assertEquals(broken, DocIdSets.isBroken(set.iterator())); + } + } + + public void testTermIsNotBroken() throws IOException { + IndexService indexService = createIndex("test", SINGLE_SHARD_SETTINGS, "type", "l", "type=long"); + client().prepareIndex("test", "type").setSource("l", 7).get(); + TermFilterBuilder filter = FilterBuilders.termFilter("l", 7).cache(randomBoolean()); + test(indexService, false, filter); + } + + public void testDefaultGeoIsBroken() throws IOException { + // Geo is slow by default :'( + IndexService indexService = createIndex("test", SINGLE_SHARD_SETTINGS, "type", "gp", "type=geo_point"); + client().prepareIndex("test", "type").setSource("gp", "2,3").get(); + FilterBuilder filter = FilterBuilders.geoDistanceFilter("gp").distance(1000, DistanceUnit.KILOMETERS).point(3, 2); + test(indexService, true, filter); + + } + + public void testIndexedGeoIsNotBroken() throws IOException { + // Geo has a fast iterator when indexing lat,lon and using the "indexed" bbox optimization + IndexService indexService = createIndex("test", SINGLE_SHARD_SETTINGS, "type", "gp", "type=geo_point,lat_lon=true"); + client().prepareIndex("test", "type").setSource("gp", "2,3").get(); + FilterBuilder filter = FilterBuilders.geoDistanceFilter("gp").distance(1000, DistanceUnit.KILOMETERS).point(3, 2).optimizeBbox("indexed"); + test(indexService, false, filter); + } + + public void testScriptIsBroken() throws IOException { // by nature unfortunately + IndexService indexService = createIndex("test", SINGLE_SHARD_SETTINGS, "type", "l", "type=long"); + client().prepareIndex("test", "type").setSource("l", 7).get(); + FilterBuilder filter = FilterBuilders.scriptFilter("doc['l'].value < 8"); + test(indexService, true, filter); + } + + public void testCachedIsNotBroken() throws IOException { + IndexService indexService = createIndex("test", SINGLE_SHARD_SETTINGS, "type", "l", "type=long"); + client().prepareIndex("test", "type").setSource("l", 7).get(); + // This filter is inherently slow but by caching it we pay the price at caching time, not iteration + FilterBuilder filter = FilterBuilders.scriptFilter("doc['l'].value < 8").cache(true); + test(indexService, false, filter); + } + + public void testOr() throws IOException { + IndexService indexService = createIndex("test", SINGLE_SHARD_SETTINGS, "type", "l", "type=long"); + client().prepareIndex("test", "type").setSource("l", new long[] {7, 8}).get(); + // Or with fast clauses is fast + FilterBuilder filter = FilterBuilders.orFilter(FilterBuilders.termFilter("l", 7), FilterBuilders.termFilter("l", 8)); + test(indexService, false, filter); + // But if at least one clause is broken, it is broken + filter = FilterBuilders.orFilter(FilterBuilders.termFilter("l", 7), FilterBuilders.scriptFilter("doc['l'].value < 8")); + test(indexService, true, filter); + } + + public void testAnd() throws IOException { + IndexService indexService = createIndex("test", SINGLE_SHARD_SETTINGS, "type", "l", "type=long"); + client().prepareIndex("test", "type").setSource("l", new long[] {7, 8}).get(); + // And with fast clauses is fast + FilterBuilder filter = FilterBuilders.andFilter(FilterBuilders.termFilter("l", 7), FilterBuilders.termFilter("l", 8)); + test(indexService, false, filter); + // If at least one clause is 'fast' and the other clauses supports random-access, it is still fast + filter = FilterBuilders.andFilter(FilterBuilders.termFilter("l", 7).cache(randomBoolean()), FilterBuilders.scriptFilter("doc['l'].value < 8")); + test(indexService, false, filter); + // However if all clauses are broken, the and is broken + filter = FilterBuilders.andFilter(FilterBuilders.scriptFilter("doc['l'].value > 5"), FilterBuilders.scriptFilter("doc['l'].value < 8")); + test(indexService, true, filter); + } + +}