From caf723570d8e52c52b1f6bd378f197c0af338c13 Mon Sep 17 00:00:00 2001 From: markharwood Date: Fri, 15 May 2015 17:03:16 +0100 Subject: [PATCH] Aggregations improvement: exclude clauses with a medium/large number of clauses fail. The underlying automaton-backed implementation throws an error if there are too many states. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fix changes to using an implementation based on Set lookups for lists of excluded terms. If the global-ordinals execution mode is in effect this implementation also addresses the slowness identified in issue 11181 which is caused by traversing the TermsEnum - instead the excluded terms’ global ordinals are looked up individually and unset the bits of acceptable terms. This is significantly faster. Closes #11176 --- .../bucket/terms/support/IncludeExclude.java | 89 +++++++++++++++++-- 1 file changed, 83 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/support/IncludeExclude.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/support/IncludeExclude.java index fb255e45373..1eff5885b1c 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/support/IncludeExclude.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/support/IncludeExclude.java @@ -37,6 +37,7 @@ import org.apache.lucene.util.automaton.RegExp; import org.elasticsearch.ElasticsearchParseException; import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.search.aggregations.support.ValuesSource; +import org.elasticsearch.search.aggregations.support.ValuesSource.Bytes.WithOrdinals; import java.io.IOException; import java.util.HashSet; @@ -80,33 +81,65 @@ public class IncludeExclude { } // Only used for the 'map' execution mode (ie. scripts) - public static class StringFilter { + public abstract static class StringFilter { + public abstract boolean accept(BytesRef value); + } + + static class AutomatonBackedStringFilter extends StringFilter { private final ByteRunAutomaton runAutomaton; - private StringFilter(Automaton automaton) { + private AutomatonBackedStringFilter(Automaton automaton) { this.runAutomaton = new ByteRunAutomaton(automaton); } /** * Returns whether the given value is accepted based on the {@code include} & {@code exclude} patterns. */ + @Override public boolean accept(BytesRef value) { return runAutomaton.run(value.bytes, value.offset, value.length); } } - public static class OrdinalsFilter { + static class TermListBackedStringFilter extends StringFilter { + + private final Set valids; + private final Set invalids; + + public TermListBackedStringFilter(Set includeValues, Set excludeValues) { + this.valids = includeValues; + this.invalids = excludeValues; + } + + /** + * Returns whether the given value is accepted based on the + * {@code include} & {@code exclude} sets. + */ + @Override + public boolean accept(BytesRef value) { + return ((valids == null) || (valids.contains(value))) && ((invalids == null) || (!invalids.contains(value))); + } + } + + public static abstract class OrdinalsFilter { + public abstract LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals, ValuesSource.Bytes.WithOrdinals valueSource) throws IOException; + + } + + static class AutomatonBackedOrdinalsFilter extends OrdinalsFilter { private final CompiledAutomaton compiled; - private OrdinalsFilter(Automaton automaton) { + private AutomatonBackedOrdinalsFilter(Automaton automaton) { this.compiled = new CompiledAutomaton(automaton); } /** * Computes which global ordinals are accepted by this IncludeExclude instance. + * */ + @Override public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals, ValuesSource.Bytes.WithOrdinals valueSource) throws IOException { LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount()); TermsEnum globalTermsEnum; @@ -120,6 +153,43 @@ public class IncludeExclude { } } + + static class TermListBackedOrdinalsFilter extends OrdinalsFilter { + + private final SortedSet includeValues; + private final SortedSet excludeValues; + + public TermListBackedOrdinalsFilter(SortedSet includeValues, SortedSet excludeValues) { + this.includeValues = includeValues; + this.excludeValues = excludeValues; + } + + @Override + public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals, WithOrdinals valueSource) throws IOException { + LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount()); + if(includeValues!=null){ + for (BytesRef term : includeValues) { + long ord = globalOrdinals.lookupTerm(term); + if (ord >= 0) { + acceptedGlobalOrdinals.set(ord); + } + } + } else { + // default to all terms being acceptable + acceptedGlobalOrdinals.set(0, acceptedGlobalOrdinals.length()); + } + if (excludeValues != null) { + for (BytesRef term : excludeValues) { + long ord = globalOrdinals.lookupTerm(term); + if (ord >= 0) { + acceptedGlobalOrdinals.clear(ord); + } + } + } + return acceptedGlobalOrdinals; + } + + } private final RegExp include, exclude; private final SortedSet includeValues, excludeValues; @@ -325,11 +395,18 @@ public class IncludeExclude { } public StringFilter convertToStringFilter() { - return new StringFilter(toAutomaton()); + if (isRegexBased()) { + return new AutomatonBackedStringFilter(toAutomaton()); + } + return new TermListBackedStringFilter(includeValues, excludeValues); } public OrdinalsFilter convertToOrdinalsFilter() { - return new OrdinalsFilter(toAutomaton()); + + if (isRegexBased()) { + return new AutomatonBackedOrdinalsFilter(toAutomaton()); + } + return new TermListBackedOrdinalsFilter(includeValues, excludeValues); } public LongFilter convertToLongFilter() {