Internal: Inverse DocIdSets' heuristic to find out fast DocIdSets.

DocIdSets.isFast(DocIdSet) has two issues:
 - it works on the DocIdSet interface while some doc sets can generate either
   slow or fast sets depending on their options (eg. whether an OrDocIdSet is
   fast or not depends on the wrapped clauses).
 - it only works because the result of this method is only taken into account
   when a DocIdSet has non-null `bits()`.

This commit changes this method to work on top of a DocIdSetIterator and to use
a black-list rather than a white list: slow iterators should really be the
exception rather than the rule.

Close #8380
This commit is contained in:
Adrien Grand 2014-11-07 09:17:50 +01:00
parent a1d5bcaa35
commit 144813629a
7 changed files with 242 additions and 51 deletions

View File

@ -19,17 +19,21 @@
package org.elasticsearch.common.lucene.docset; package org.elasticsearch.common.lucene.docset;
import com.google.common.collect.Iterables;
import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits; import org.apache.lucene.util.Bits;
import org.apache.lucene.util.InPlaceMergeSorter; import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.RamUsageEstimator;
import org.elasticsearch.common.lucene.search.XDocIdSetIterator;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collection; import java.util.Collection;
import java.util.Collections;
import java.util.List; import java.util.List;
/** /**
@ -78,18 +82,21 @@ public class AndDocIdSet extends DocIdSet {
public DocIdSetIterator iterator() throws IOException { public DocIdSetIterator iterator() throws IOException {
// we try and be smart here, if we can iterate through docsets quickly, prefer to iterate // we try and be smart here, if we can iterate through docsets quickly, prefer to iterate
// over them as much as possible, before actually going to "bits" based ones to check // over them as much as possible, before actually going to "bits" based ones to check
List<DocIdSet> iterators = new ArrayList<>(sets.length); List<DocIdSetIterator> iterators = new ArrayList<>(sets.length);
List<Bits> bits = new ArrayList<>(sets.length); List<Bits> bits = new ArrayList<>(sets.length);
for (DocIdSet set : sets) { for (DocIdSet set : sets) {
if (DocIdSets.isFastIterator(set)) { if (DocIdSets.isEmpty(set)) {
iterators.add(set); return DocIdSetIterator.empty();
} else { }
DocIdSetIterator it = set.iterator();
if (it == null) {
return DocIdSetIterator.empty();
}
Bits bit = set.bits(); Bits bit = set.bits();
if (bit != null) { if (bit != null && DocIdSets.isBroken(it)) {
bits.add(bit); bits.add(bit);
} else { } else {
iterators.add(set); iterators.add(it);
}
} }
} }
if (bits.isEmpty()) { if (bits.isEmpty()) {
@ -131,38 +138,25 @@ public class AndDocIdSet extends DocIdSet {
} }
} }
static class IteratorBasedIterator extends DocIdSetIterator { static class IteratorBasedIterator extends XDocIdSetIterator {
private int doc = -1; private int doc = -1;
private final DocIdSetIterator lead; private final DocIdSetIterator lead;
private final DocIdSetIterator[] otherIterators; private final DocIdSetIterator[] otherIterators;
public static DocIdSetIterator newDocIdSetIterator(Collection<DocIdSet> sets) throws IOException { public static DocIdSetIterator newDocIdSetIterator(Collection<DocIdSetIterator> iterators) throws IOException {
if (sets.isEmpty()) { if (iterators.isEmpty()) {
return DocIdSetIterator.empty(); return DocIdSetIterator.empty();
} }
final DocIdSetIterator[] iterators = new DocIdSetIterator[sets.size()]; if (iterators.size() == 1) {
int j = 0;
for (DocIdSet set : sets) {
if (set == null) {
return DocIdSetIterator.empty();
} else {
DocIdSetIterator docIdSetIterator = set.iterator();
if (docIdSetIterator == null) {
return DocIdSetIterator.empty();// non matching
}
iterators[j++] = docIdSetIterator;
}
}
if (sets.size() == 1) {
// shortcut if there is only one valid iterator. // shortcut if there is only one valid iterator.
return iterators[0]; return iterators.iterator().next();
} }
return new IteratorBasedIterator(iterators); return new IteratorBasedIterator(iterators);
} }
private IteratorBasedIterator(DocIdSetIterator[] iterators) throws IOException { private IteratorBasedIterator(Collection<DocIdSetIterator> iterators) throws IOException {
final DocIdSetIterator[] sortedIterators = Arrays.copyOf(iterators, iterators.length); final DocIdSetIterator[] sortedIterators = iterators.toArray(new DocIdSetIterator[iterators.size()]);
new InPlaceMergeSorter() { new InPlaceMergeSorter() {
@Override @Override
@ -180,6 +174,16 @@ public class AndDocIdSet extends DocIdSet {
this.otherIterators = Arrays.copyOfRange(sortedIterators, 1, sortedIterators.length); this.otherIterators = Arrays.copyOfRange(sortedIterators, 1, sortedIterators.length);
} }
@Override
public boolean isBroken() {
for (DocIdSetIterator it : Iterables.concat(Collections.singleton(lead), Arrays.asList(otherIterators))) {
if (DocIdSets.isBroken(it)) {
return true;
}
}
return false;
}
@Override @Override
public final int docID() { public final int docID() {
return doc; return doc;

View File

@ -20,9 +20,10 @@
package org.elasticsearch.common.lucene.docset; package org.elasticsearch.common.lucene.docset;
import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReader;
import org.apache.lucene.search.BitsFilteredDocIdSet;
import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.DocValuesDocIdSet;
import org.apache.lucene.search.FilteredDocIdSetIterator;
import org.apache.lucene.util.BitDocIdSet; import org.apache.lucene.util.BitDocIdSet;
import org.apache.lucene.util.BitSet; import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.Bits; import org.apache.lucene.util.Bits;
@ -30,6 +31,7 @@ import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.RoaringDocIdSet; import org.apache.lucene.util.RoaringDocIdSet;
import org.apache.lucene.util.SparseFixedBitSet; import org.apache.lucene.util.SparseFixedBitSet;
import org.elasticsearch.common.Nullable; import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.lucene.search.XDocIdSetIterator;
import java.io.IOException; import java.io.IOException;
@ -52,15 +54,28 @@ public class DocIdSets {
} }
/** /**
* Is {@link org.apache.lucene.search.DocIdSetIterator} implemented in a "fast" manner. * Check if the given iterator can nextDoc() or advance() in sub-linear time
* For example, it does not ends up iterating one doc at a time check for its "value". * of the number of documents. For instance, an iterator that would need to
* iterate one document at a time to check for its value would be considered
* broken.
*/ */
public static boolean isFastIterator(DocIdSet set) { public static boolean isBroken(DocIdSetIterator iterator) {
// TODO: this is really horrible while (iterator instanceof FilteredDocIdSetIterator) {
while (set instanceof BitsFilteredDocIdSet) { // this iterator is filtered (likely by some bits)
set = ((BitsFilteredDocIdSet) set).getDelegate(); // unwrap in order to check if the underlying iterator is fast
iterator = ((FilteredDocIdSetIterator) iterator).getDelegate();
} }
return set instanceof BitDocIdSet || set instanceof RoaringDocIdSet; if (iterator instanceof XDocIdSetIterator) {
return ((XDocIdSetIterator) iterator).isBroken();
}
if (iterator instanceof MatchDocIdSetIterator) {
return true;
}
// DocValuesDocIdSet produces anonymous slow iterators
if (iterator != null && DocValuesDocIdSet.class.equals(iterator.getClass().getEnclosingClass())) {
return true;
}
return false;
} }
/** /**

View File

@ -23,9 +23,9 @@ import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits; import org.apache.lucene.util.Bits;
import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.util.RamUsageEstimator;
import org.elasticsearch.common.lucene.search.XDocIdSetIterator;
import java.io.IOException; import java.io.IOException;
import java.util.Collection;
/** /**
* *
@ -99,7 +99,7 @@ public class OrDocIdSet extends DocIdSet {
} }
} }
static class IteratorBasedIterator extends DocIdSetIterator { static class IteratorBasedIterator extends XDocIdSetIterator {
final class Item { final class Item {
public final DocIdSetIterator iter; public final DocIdSetIterator iter;
@ -115,23 +115,32 @@ public class OrDocIdSet extends DocIdSet {
private final Item[] _heap; private final Item[] _heap;
private int _size; private int _size;
private final long cost; private final long cost;
private final boolean broken;
IteratorBasedIterator(DocIdSet[] sets) throws IOException { IteratorBasedIterator(DocIdSet[] sets) throws IOException {
_curDoc = -1; _curDoc = -1;
_heap = new Item[sets.length]; _heap = new Item[sets.length];
_size = 0; _size = 0;
long cost = 0; long cost = 0;
boolean broken = false;
for (DocIdSet set : sets) { for (DocIdSet set : sets) {
DocIdSetIterator iterator = set.iterator(); DocIdSetIterator iterator = set.iterator();
broken |= DocIdSets.isBroken(iterator);
if (iterator != null) { if (iterator != null) {
_heap[_size++] = new Item(iterator); _heap[_size++] = new Item(iterator);
cost += iterator.cost(); cost += iterator.cost();
} }
} }
this.cost = cost; this.cost = cost;
this.broken = broken;
if (_size == 0) _curDoc = DocIdSetIterator.NO_MORE_DOCS; if (_size == 0) _curDoc = DocIdSetIterator.NO_MORE_DOCS;
} }
@Override
public boolean isBroken() {
return broken;
}
@Override @Override
public final int docID() { public final int docID() {
return _curDoc; return _curDoc;

View File

@ -124,29 +124,29 @@ public class XBooleanFilter extends Filter implements Iterable<FilterClause> {
// continue, but we recorded that there is at least one should clause // continue, but we recorded that there is at least one should clause
// so that if all iterators are null we know that nothing matches this // so that if all iterators are null we know that nothing matches this
// filter since at least one SHOULD clause needs to match // filter since at least one SHOULD clause needs to match
} else if (bits == null || DocIdSets.isFastIterator(set)) { } else if (bits != null && DocIdSets.isBroken(it)) {
shouldIterators.add(it);
} else {
shouldBits.add(bits); shouldBits.add(bits);
} else {
shouldIterators.add(it);
} }
break; break;
case MUST: case MUST:
if (it == null) { if (it == null) {
// no documents matched a clause that is compulsory, then nothing matches at all // no documents matched a clause that is compulsory, then nothing matches at all
return null; return null;
} else if (bits == null || DocIdSets.isFastIterator(set)) { } else if (bits != null && DocIdSets.isBroken(it)) {
requiredIterators.add(it);
} else {
requiredBits.add(bits); requiredBits.add(bits);
} else {
requiredIterators.add(it);
} }
break; break;
case MUST_NOT: case MUST_NOT:
if (it == null) { if (it == null) {
// ignore // ignore
} else if (bits == null || DocIdSets.isFastIterator(set)) { } else if (bits != null && DocIdSets.isBroken(it)) {
excludedIterators.add(it);
} else {
excludedBits.add(bits); excludedBits.add(bits);
} else {
excludedIterators.add(it);
} }
break; break;
default: default:

View File

@ -0,0 +1,40 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.common.lucene.search;
import org.apache.lucene.search.DocIdSetIterator;
import org.elasticsearch.common.lucene.docset.DocIdSets;
/**
* Extension of {@link DocIdSetIterator} that allows to know if iteration is
* implemented efficiently.
*/
public abstract class XDocIdSetIterator extends DocIdSetIterator {
/**
* Return <tt>true</tt> if this iterator cannot both
* {@link DocIdSetIterator#nextDoc} and {@link DocIdSetIterator#advance}
* in sub-linear time.
*
* Do not call this method directly, use {@link DocIdSets#isBroken}.
*/
public abstract boolean isBroken();
}

View File

@ -76,8 +76,8 @@ public class FilteredQueryParser implements QueryParser {
@Override @Override
public Scorer filteredScorer(LeafReaderContext context, Weight weight, DocIdSet docIdSet) throws IOException { public Scorer filteredScorer(LeafReaderContext context, Weight weight, DocIdSet docIdSet) throws IOException {
// CHANGE: If threshold is 0, always pass down the accept docs, don't pay the price of calling nextDoc even... // CHANGE: If threshold is 0, always pass down the accept docs, don't pay the price of calling nextDoc even...
if (threshold == 0) {
final Bits filterAcceptDocs = docIdSet.bits(); final Bits filterAcceptDocs = docIdSet.bits();
if (threshold == 0) {
if (filterAcceptDocs != null) { if (filterAcceptDocs != null) {
return weight.scorer(context, filterAcceptDocs); return weight.scorer(context, filterAcceptDocs);
} else { } else {
@ -88,7 +88,8 @@ public class FilteredQueryParser implements QueryParser {
// CHANGE: handle "default" value // CHANGE: handle "default" value
if (threshold == -1) { if (threshold == -1) {
// default value, don't iterate on only apply filter after query if its not a "fast" docIdSet // default value, don't iterate on only apply filter after query if its not a "fast" docIdSet
if (!DocIdSets.isFastIterator(docIdSet)) { // TODO: is there a way we could avoid creating an iterator here?
if (filterAcceptDocs != null && DocIdSets.isBroken(docIdSet.iterator())) {
return FilteredQuery.QUERY_FIRST_FILTER_STRATEGY.filteredScorer(context, weight, docIdSet); return FilteredQuery.QUERY_FIRST_FILTER_STRATEGY.filteredScorer(context, weight, docIdSet);
} }
} }

View File

@ -0,0 +1,122 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.common.lucene.docset;
import java.io.IOException;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.DistanceUnit;
import org.elasticsearch.common.xcontent.ToXContent;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.common.xcontent.json.JsonXContent;
import org.elasticsearch.index.engine.Engine.Searcher;
import org.elasticsearch.index.query.FilterBuilder;
import org.elasticsearch.index.query.FilterBuilders;
import org.elasticsearch.index.query.TermFilterBuilder;
import org.elasticsearch.index.service.IndexService;
import org.elasticsearch.test.ElasticsearchSingleNodeTest;
public class DocIdSetsTests extends ElasticsearchSingleNodeTest {
private static final Settings SINGLE_SHARD_SETTINGS = ImmutableSettings.builder().put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1).build();
private void test(IndexService indexService, boolean broken, FilterBuilder filterBuilder) throws IOException {
client().admin().indices().prepareRefresh("test").get();
XContentBuilder builder = filterBuilder.toXContent(JsonXContent.contentBuilder(), ToXContent.EMPTY_PARAMS);
XContentParser parser = JsonXContent.jsonXContent.createParser(builder.bytes());
Filter filter = indexService.queryParserService().parseInnerFilter(parser).filter();
try (Searcher searcher = indexService.shardSafe(0).acquireSearcher("test")) {
final LeafReaderContext ctx = searcher.reader().leaves().get(0);
DocIdSet set = filter.getDocIdSet(ctx, null);
assertEquals(broken, DocIdSets.isBroken(set.iterator()));
}
}
public void testTermIsNotBroken() throws IOException {
IndexService indexService = createIndex("test", SINGLE_SHARD_SETTINGS, "type", "l", "type=long");
client().prepareIndex("test", "type").setSource("l", 7).get();
TermFilterBuilder filter = FilterBuilders.termFilter("l", 7).cache(randomBoolean());
test(indexService, false, filter);
}
public void testDefaultGeoIsBroken() throws IOException {
// Geo is slow by default :'(
IndexService indexService = createIndex("test", SINGLE_SHARD_SETTINGS, "type", "gp", "type=geo_point");
client().prepareIndex("test", "type").setSource("gp", "2,3").get();
FilterBuilder filter = FilterBuilders.geoDistanceFilter("gp").distance(1000, DistanceUnit.KILOMETERS).point(3, 2);
test(indexService, true, filter);
}
public void testIndexedGeoIsNotBroken() throws IOException {
// Geo has a fast iterator when indexing lat,lon and using the "indexed" bbox optimization
IndexService indexService = createIndex("test", SINGLE_SHARD_SETTINGS, "type", "gp", "type=geo_point,lat_lon=true");
client().prepareIndex("test", "type").setSource("gp", "2,3").get();
FilterBuilder filter = FilterBuilders.geoDistanceFilter("gp").distance(1000, DistanceUnit.KILOMETERS).point(3, 2).optimizeBbox("indexed");
test(indexService, false, filter);
}
public void testScriptIsBroken() throws IOException { // by nature unfortunately
IndexService indexService = createIndex("test", SINGLE_SHARD_SETTINGS, "type", "l", "type=long");
client().prepareIndex("test", "type").setSource("l", 7).get();
FilterBuilder filter = FilterBuilders.scriptFilter("doc['l'].value < 8");
test(indexService, true, filter);
}
public void testCachedIsNotBroken() throws IOException {
IndexService indexService = createIndex("test", SINGLE_SHARD_SETTINGS, "type", "l", "type=long");
client().prepareIndex("test", "type").setSource("l", 7).get();
// This filter is inherently slow but by caching it we pay the price at caching time, not iteration
FilterBuilder filter = FilterBuilders.scriptFilter("doc['l'].value < 8").cache(true);
test(indexService, false, filter);
}
public void testOr() throws IOException {
IndexService indexService = createIndex("test", SINGLE_SHARD_SETTINGS, "type", "l", "type=long");
client().prepareIndex("test", "type").setSource("l", new long[] {7, 8}).get();
// Or with fast clauses is fast
FilterBuilder filter = FilterBuilders.orFilter(FilterBuilders.termFilter("l", 7), FilterBuilders.termFilter("l", 8));
test(indexService, false, filter);
// But if at least one clause is broken, it is broken
filter = FilterBuilders.orFilter(FilterBuilders.termFilter("l", 7), FilterBuilders.scriptFilter("doc['l'].value < 8"));
test(indexService, true, filter);
}
public void testAnd() throws IOException {
IndexService indexService = createIndex("test", SINGLE_SHARD_SETTINGS, "type", "l", "type=long");
client().prepareIndex("test", "type").setSource("l", new long[] {7, 8}).get();
// And with fast clauses is fast
FilterBuilder filter = FilterBuilders.andFilter(FilterBuilders.termFilter("l", 7), FilterBuilders.termFilter("l", 8));
test(indexService, false, filter);
// If at least one clause is 'fast' and the other clauses supports random-access, it is still fast
filter = FilterBuilders.andFilter(FilterBuilders.termFilter("l", 7).cache(randomBoolean()), FilterBuilders.scriptFilter("doc['l'].value < 8"));
test(indexService, false, filter);
// However if all clauses are broken, the and is broken
filter = FilterBuilders.andFilter(FilterBuilders.scriptFilter("doc['l'].value > 5"), FilterBuilders.scriptFilter("doc['l'].value < 8"));
test(indexService, true, filter);
}
}