Aggregations improvement: exclude clauses with a medium/large number of clauses fail.

The underlying automaton-backed implementation throws an error if there are too many states.

This fix changes to using an implementation based on Set lookups for lists of excluded terms.
If the global-ordinals execution mode is in effect this implementation also addresses the slowness identified in issue 11181 which is caused by traversing the TermsEnum - instead the excluded terms’ global ordinals are looked up individually and unset the bits of acceptable terms. This is significantly faster.

Closes #11176
This commit is contained in:
markharwood 2015-05-15 17:03:16 +01:00
parent 1a967ce267
commit caf723570d
1 changed files with 83 additions and 6 deletions

View File

@ -37,6 +37,7 @@ import org.apache.lucene.util.automaton.RegExp;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.search.aggregations.support.ValuesSource;
import org.elasticsearch.search.aggregations.support.ValuesSource.Bytes.WithOrdinals;
import java.io.IOException;
import java.util.HashSet;
@ -80,33 +81,65 @@ public class IncludeExclude {
}
// Only used for the 'map' execution mode (ie. scripts)
public static class StringFilter {
public abstract static class StringFilter {
public abstract boolean accept(BytesRef value);
}
static class AutomatonBackedStringFilter extends StringFilter {
private final ByteRunAutomaton runAutomaton;
private StringFilter(Automaton automaton) {
private AutomatonBackedStringFilter(Automaton automaton) {
this.runAutomaton = new ByteRunAutomaton(automaton);
}
/**
* Returns whether the given value is accepted based on the {@code include} & {@code exclude} patterns.
*/
@Override
public boolean accept(BytesRef value) {
return runAutomaton.run(value.bytes, value.offset, value.length);
}
}
public static class OrdinalsFilter {
static class TermListBackedStringFilter extends StringFilter {
private final Set<BytesRef> valids;
private final Set<BytesRef> invalids;
public TermListBackedStringFilter(Set<BytesRef> includeValues, Set<BytesRef> excludeValues) {
this.valids = includeValues;
this.invalids = excludeValues;
}
/**
* Returns whether the given value is accepted based on the
* {@code include} & {@code exclude} sets.
*/
@Override
public boolean accept(BytesRef value) {
return ((valids == null) || (valids.contains(value))) && ((invalids == null) || (!invalids.contains(value)));
}
}
public static abstract class OrdinalsFilter {
public abstract LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals, ValuesSource.Bytes.WithOrdinals valueSource) throws IOException;
}
static class AutomatonBackedOrdinalsFilter extends OrdinalsFilter {
private final CompiledAutomaton compiled;
private OrdinalsFilter(Automaton automaton) {
private AutomatonBackedOrdinalsFilter(Automaton automaton) {
this.compiled = new CompiledAutomaton(automaton);
}
/**
* Computes which global ordinals are accepted by this IncludeExclude instance.
*
*/
@Override
public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals, ValuesSource.Bytes.WithOrdinals valueSource) throws IOException {
LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
TermsEnum globalTermsEnum;
@ -120,6 +153,43 @@ public class IncludeExclude {
}
}
static class TermListBackedOrdinalsFilter extends OrdinalsFilter {
private final SortedSet<BytesRef> includeValues;
private final SortedSet<BytesRef> excludeValues;
public TermListBackedOrdinalsFilter(SortedSet<BytesRef> includeValues, SortedSet<BytesRef> excludeValues) {
this.includeValues = includeValues;
this.excludeValues = excludeValues;
}
@Override
public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals, WithOrdinals valueSource) throws IOException {
LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
if(includeValues!=null){
for (BytesRef term : includeValues) {
long ord = globalOrdinals.lookupTerm(term);
if (ord >= 0) {
acceptedGlobalOrdinals.set(ord);
}
}
} else {
// default to all terms being acceptable
acceptedGlobalOrdinals.set(0, acceptedGlobalOrdinals.length());
}
if (excludeValues != null) {
for (BytesRef term : excludeValues) {
long ord = globalOrdinals.lookupTerm(term);
if (ord >= 0) {
acceptedGlobalOrdinals.clear(ord);
}
}
}
return acceptedGlobalOrdinals;
}
}
private final RegExp include, exclude;
private final SortedSet<BytesRef> includeValues, excludeValues;
@ -325,11 +395,18 @@ public class IncludeExclude {
}
public StringFilter convertToStringFilter() {
return new StringFilter(toAutomaton());
if (isRegexBased()) {
return new AutomatonBackedStringFilter(toAutomaton());
}
return new TermListBackedStringFilter(includeValues, excludeValues);
}
public OrdinalsFilter convertToOrdinalsFilter() {
return new OrdinalsFilter(toAutomaton());
if (isRegexBased()) {
return new AutomatonBackedOrdinalsFilter(toAutomaton());
}
return new TermListBackedOrdinalsFilter(includeValues, excludeValues);
}
public LongFilter convertToLongFilter() {