Merge pull request #10418 from jpountz/enhancement/speed_up_aggs_include_exclude
Aggregations: Speed up include/exclude in terms aggregations with regexps. Close #10418
This commit is contained in:
commit
e25db222ee
|
@ -139,6 +139,9 @@ equivalent to the former `pre_zone` option. Setting `time_zone` to a value like
|
||||||
being applied in the specified time zone. In addition to this, the `pre_zone_adjust_large_interval` option is removed because we
|
being applied in the specified time zone. In addition to this, the `pre_zone_adjust_large_interval` option is removed because we
|
||||||
now always return dates and bucket keys in UTC.
|
now always return dates and bucket keys in UTC.
|
||||||
|
|
||||||
|
`include`/`exclude` filtering on the `terms` aggregation now uses the same syntax as regexp queries instead of the Java syntax. While simple
|
||||||
|
regexps should still work, more complex ones might need some rewriting. Also, the `flags` parameter is not supported anymore.
|
||||||
|
|
||||||
=== Terms filter lookup caching
|
=== Terms filter lookup caching
|
||||||
|
|
||||||
The terms filter lookup mechanism does not support the `cache` option anymore
|
The terms filter lookup mechanism does not support the `cache` option anymore
|
||||||
|
|
|
@ -482,42 +482,7 @@ with `water_` (so the tag `water_sports` will not be aggregated). The `include` r
|
||||||
values are "allowed" to be aggregated, while the `exclude` determines the values that should not be aggregated. When
|
values are "allowed" to be aggregated, while the `exclude` determines the values that should not be aggregated. When
|
||||||
both are defined, the `exclude` has precedence, meaning, the `include` is evaluated first and only then the `exclude`.
|
both are defined, the `exclude` has precedence, meaning, the `include` is evaluated first and only then the `exclude`.
|
||||||
|
|
||||||
The regular expression are based on the Java(TM) http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html[Pattern],
|
The syntax is the same as <<regexp-syntax,regexp queries>>.
|
||||||
and as such, they it is also possible to pass in flags that will determine how the compiled regular expression will work:
|
|
||||||
|
|
||||||
[source,js]
|
|
||||||
--------------------------------------------------
|
|
||||||
{
|
|
||||||
"aggs" : {
|
|
||||||
"tags" : {
|
|
||||||
"terms" : {
|
|
||||||
"field" : "tags",
|
|
||||||
"include" : {
|
|
||||||
"pattern" : ".*sport.*",
|
|
||||||
"flags" : "CANON_EQ|CASE_INSENSITIVE" <1>
|
|
||||||
},
|
|
||||||
"exclude" : {
|
|
||||||
"pattern" : "water_.*",
|
|
||||||
"flags" : "CANON_EQ|CASE_INSENSITIVE"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
--------------------------------------------------
|
|
||||||
|
|
||||||
<1> the flags are concatenated using the `|` character as a separator
|
|
||||||
|
|
||||||
The possible flags that can be used are:
|
|
||||||
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#CANON_EQ[`CANON_EQ`],
|
|
||||||
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#CASE_INSENSITIVE[`CASE_INSENSITIVE`],
|
|
||||||
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#COMMENTS[`COMMENTS`],
|
|
||||||
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#DOTALL[`DOTALL`],
|
|
||||||
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#LITERAL[`LITERAL`],
|
|
||||||
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#MULTILINE[`MULTILINE`],
|
|
||||||
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNICODE_CASE[`UNICODE_CASE`],
|
|
||||||
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNICODE_CHARACTER_CLASS[`UNICODE_CHARACTER_CLASS`] and
|
|
||||||
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNIX_LINES[`UNIX_LINES`]
|
|
||||||
|
|
||||||
For matching based on exact values the `include` and `exclude` parameters can simply take an array of
|
For matching based on exact values the `include` and `exclude` parameters can simply take an array of
|
||||||
strings that represent the terms as they are found in the index:
|
strings that represent the terms as they are found in the index:
|
||||||
|
|
|
@ -48,7 +48,7 @@ public class GlobalOrdinalsSignificantTermsAggregator extends GlobalOrdinalsStri
|
||||||
|
|
||||||
public GlobalOrdinalsSignificantTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource,
|
public GlobalOrdinalsSignificantTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource,
|
||||||
BucketCountThresholds bucketCountThresholds,
|
BucketCountThresholds bucketCountThresholds,
|
||||||
IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent,
|
IncludeExclude.OrdinalsFilter includeExclude, AggregationContext aggregationContext, Aggregator parent,
|
||||||
SignificantTermsAggregatorFactory termsAggFactory, Map<String, Object> metaData) throws IOException {
|
SignificantTermsAggregatorFactory termsAggFactory, Map<String, Object> metaData) throws IOException {
|
||||||
|
|
||||||
super(name, factories, valuesSource, null, bucketCountThresholds, includeExclude, aggregationContext, parent, SubAggCollectionMode.DEPTH_FIRST, false, metaData);
|
super(name, factories, valuesSource, null, bucketCountThresholds, includeExclude, aggregationContext, parent, SubAggCollectionMode.DEPTH_FIRST, false, metaData);
|
||||||
|
@ -145,7 +145,7 @@ public class GlobalOrdinalsSignificantTermsAggregator extends GlobalOrdinalsStri
|
||||||
|
|
||||||
private final LongHash bucketOrds;
|
private final LongHash bucketOrds;
|
||||||
|
|
||||||
public WithHash(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource, BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggFactory, Map<String, Object> metaData) throws IOException {
|
public WithHash(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource, BucketCountThresholds bucketCountThresholds, IncludeExclude.OrdinalsFilter includeExclude, AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggFactory, Map<String, Object> metaData) throws IOException {
|
||||||
super(name, factories, valuesSource, bucketCountThresholds, includeExclude, aggregationContext, parent, termsAggFactory, metaData);
|
super(name, factories, valuesSource, bucketCountThresholds, includeExclude, aggregationContext, parent, termsAggFactory, metaData);
|
||||||
bucketOrds = new LongHash(1, aggregationContext.bigArrays());
|
bucketOrds = new LongHash(1, aggregationContext.bigArrays());
|
||||||
}
|
}
|
||||||
|
|
|
@ -47,7 +47,7 @@ public class SignificantStringTermsAggregator extends StringTermsAggregator {
|
||||||
|
|
||||||
public SignificantStringTermsAggregator(String name, AggregatorFactories factories, ValuesSource valuesSource,
|
public SignificantStringTermsAggregator(String name, AggregatorFactories factories, ValuesSource valuesSource,
|
||||||
BucketCountThresholds bucketCountThresholds,
|
BucketCountThresholds bucketCountThresholds,
|
||||||
IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent,
|
IncludeExclude.StringFilter includeExclude, AggregationContext aggregationContext, Aggregator parent,
|
||||||
SignificantTermsAggregatorFactory termsAggFactory, Map<String, Object> metaData) throws IOException {
|
SignificantTermsAggregatorFactory termsAggFactory, Map<String, Object> metaData) throws IOException {
|
||||||
|
|
||||||
super(name, factories, valuesSource, null, bucketCountThresholds, includeExclude, aggregationContext, parent, SubAggCollectionMode.DEPTH_FIRST, false, metaData);
|
super(name, factories, valuesSource, null, bucketCountThresholds, includeExclude, aggregationContext, parent, SubAggCollectionMode.DEPTH_FIRST, false, metaData);
|
||||||
|
|
|
@ -65,7 +65,8 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
|
||||||
Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource,
|
Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource,
|
||||||
TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
|
TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
|
||||||
AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggregatorFactory, Map<String, Object> metaData) throws IOException {
|
AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggregatorFactory, Map<String, Object> metaData) throws IOException {
|
||||||
return new SignificantStringTermsAggregator(name, factories, valuesSource, bucketCountThresholds, includeExclude, aggregationContext, parent, termsAggregatorFactory, metaData);
|
final IncludeExclude.StringFilter filter = includeExclude == null ? null : includeExclude.convertToStringFilter();
|
||||||
|
return new SignificantStringTermsAggregator(name, factories, valuesSource, bucketCountThresholds, filter, aggregationContext, parent, termsAggregatorFactory, metaData);
|
||||||
}
|
}
|
||||||
|
|
||||||
},
|
},
|
||||||
|
@ -77,7 +78,8 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
|
||||||
AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggregatorFactory, Map<String, Object> metaData) throws IOException {
|
AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggregatorFactory, Map<String, Object> metaData) throws IOException {
|
||||||
ValuesSource.Bytes.WithOrdinals valueSourceWithOrdinals = (ValuesSource.Bytes.WithOrdinals) valuesSource;
|
ValuesSource.Bytes.WithOrdinals valueSourceWithOrdinals = (ValuesSource.Bytes.WithOrdinals) valuesSource;
|
||||||
IndexSearcher indexSearcher = aggregationContext.searchContext().searcher();
|
IndexSearcher indexSearcher = aggregationContext.searchContext().searcher();
|
||||||
return new GlobalOrdinalsSignificantTermsAggregator(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, bucketCountThresholds, includeExclude, aggregationContext, parent, termsAggregatorFactory, metaData);
|
final IncludeExclude.OrdinalsFilter filter = includeExclude == null ? null : includeExclude.convertToOrdinalsFilter();
|
||||||
|
return new GlobalOrdinalsSignificantTermsAggregator(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, bucketCountThresholds, filter, aggregationContext, parent, termsAggregatorFactory, metaData);
|
||||||
}
|
}
|
||||||
|
|
||||||
},
|
},
|
||||||
|
@ -87,7 +89,8 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
|
||||||
Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource,
|
Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource,
|
||||||
TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
|
TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
|
||||||
AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggregatorFactory, Map<String, Object> metaData) throws IOException {
|
AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggregatorFactory, Map<String, Object> metaData) throws IOException {
|
||||||
return new GlobalOrdinalsSignificantTermsAggregator.WithHash(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, bucketCountThresholds, includeExclude, aggregationContext, parent, termsAggregatorFactory, metaData);
|
final IncludeExclude.OrdinalsFilter filter = includeExclude == null ? null : includeExclude.convertToOrdinalsFilter();
|
||||||
|
return new GlobalOrdinalsSignificantTermsAggregator.WithHash(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, bucketCountThresholds, filter, aggregationContext, parent, termsAggregatorFactory, metaData);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -57,7 +57,7 @@ public class SignificantTermsParser implements Aggregator.Parser {
|
||||||
.scriptable(false)
|
.scriptable(false)
|
||||||
.formattable(true)
|
.formattable(true)
|
||||||
.build();
|
.build();
|
||||||
IncludeExclude.Parser incExcParser = new IncludeExclude.Parser(aggregationName, SignificantStringTerms.TYPE, context);
|
IncludeExclude.Parser incExcParser = new IncludeExclude.Parser();
|
||||||
aggParser.parse(aggregationName, parser, context, vsParser, incExcParser);
|
aggParser.parse(aggregationName, parser, context, vsParser, incExcParser);
|
||||||
|
|
||||||
TermsAggregator.BucketCountThresholds bucketCountThresholds = aggParser.getBucketCountThresholds();
|
TermsAggregator.BucketCountThresholds bucketCountThresholds = aggParser.getBucketCountThresholds();
|
||||||
|
|
|
@ -57,7 +57,7 @@ import java.util.Map;
|
||||||
public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggregator {
|
public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggregator {
|
||||||
|
|
||||||
protected final ValuesSource.Bytes.WithOrdinals.FieldData valuesSource;
|
protected final ValuesSource.Bytes.WithOrdinals.FieldData valuesSource;
|
||||||
protected final IncludeExclude includeExclude;
|
protected final IncludeExclude.OrdinalsFilter includeExclude;
|
||||||
|
|
||||||
// TODO: cache the acceptedglobalValues per aggregation definition.
|
// TODO: cache the acceptedglobalValues per aggregation definition.
|
||||||
// We can't cache this yet in ValuesSource, since ValuesSource is reused per field for aggs during the execution.
|
// We can't cache this yet in ValuesSource, since ValuesSource is reused per field for aggs during the execution.
|
||||||
|
@ -71,7 +71,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
|
||||||
|
|
||||||
public GlobalOrdinalsStringTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource,
|
public GlobalOrdinalsStringTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource,
|
||||||
Terms.Order order, BucketCountThresholds bucketCountThresholds,
|
Terms.Order order, BucketCountThresholds bucketCountThresholds,
|
||||||
IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
|
IncludeExclude.OrdinalsFilter includeExclude, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
|
||||||
super(name, factories, aggregationContext, parent, order, bucketCountThresholds, collectionMode, showTermDocCountError, metaData);
|
super(name, factories, aggregationContext, parent, order, bucketCountThresholds, collectionMode, showTermDocCountError, metaData);
|
||||||
this.valuesSource = valuesSource;
|
this.valuesSource = valuesSource;
|
||||||
this.includeExclude = includeExclude;
|
this.includeExclude = includeExclude;
|
||||||
|
@ -260,7 +260,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
|
||||||
private final LongHash bucketOrds;
|
private final LongHash bucketOrds;
|
||||||
|
|
||||||
public WithHash(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource,
|
public WithHash(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource,
|
||||||
Terms.Order order, BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude, AggregationContext aggregationContext,
|
Terms.Order order, BucketCountThresholds bucketCountThresholds, IncludeExclude.OrdinalsFilter includeExclude, AggregationContext aggregationContext,
|
||||||
Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
|
Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
|
||||||
super(name, factories, valuesSource, order, bucketCountThresholds, includeExclude, aggregationContext, parent, collectionMode, showTermDocCountError, metaData);
|
super(name, factories, valuesSource, order, bucketCountThresholds, includeExclude, aggregationContext, parent, collectionMode, showTermDocCountError, metaData);
|
||||||
bucketOrds = new LongHash(1, aggregationContext.bigArrays());
|
bucketOrds = new LongHash(1, aggregationContext.bigArrays());
|
||||||
|
|
|
@ -45,11 +45,11 @@ public class StringTermsAggregator extends AbstractStringTermsAggregator {
|
||||||
|
|
||||||
private final ValuesSource valuesSource;
|
private final ValuesSource valuesSource;
|
||||||
protected final BytesRefHash bucketOrds;
|
protected final BytesRefHash bucketOrds;
|
||||||
private final IncludeExclude includeExclude;
|
private final IncludeExclude.StringFilter includeExclude;
|
||||||
|
|
||||||
public StringTermsAggregator(String name, AggregatorFactories factories, ValuesSource valuesSource,
|
public StringTermsAggregator(String name, AggregatorFactories factories, ValuesSource valuesSource,
|
||||||
Terms.Order order, BucketCountThresholds bucketCountThresholds,
|
Terms.Order order, BucketCountThresholds bucketCountThresholds,
|
||||||
IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
|
IncludeExclude.StringFilter includeExclude, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
|
||||||
|
|
||||||
super(name, factories, aggregationContext, parent, order, bucketCountThresholds, collectionMode, showTermDocCountError, metaData);
|
super(name, factories, aggregationContext, parent, order, bucketCountThresholds, collectionMode, showTermDocCountError, metaData);
|
||||||
this.valuesSource = valuesSource;
|
this.valuesSource = valuesSource;
|
||||||
|
|
|
@ -50,7 +50,8 @@ public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory<Values
|
||||||
Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource,
|
Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource,
|
||||||
Terms.Order order, TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
|
Terms.Order order, TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
|
||||||
AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode subAggCollectMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
|
AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode subAggCollectMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
|
||||||
return new StringTermsAggregator(name, factories, valuesSource, order, bucketCountThresholds, includeExclude, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData);
|
final IncludeExclude.StringFilter filter = includeExclude == null ? null : includeExclude.convertToStringFilter();
|
||||||
|
return new StringTermsAggregator(name, factories, valuesSource, order, bucketCountThresholds, filter, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -65,7 +66,8 @@ public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory<Values
|
||||||
Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource,
|
Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource,
|
||||||
Terms.Order order, TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
|
Terms.Order order, TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
|
||||||
AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode subAggCollectMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
|
AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode subAggCollectMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
|
||||||
return new GlobalOrdinalsStringTermsAggregator(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, order, bucketCountThresholds, includeExclude, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData);
|
final IncludeExclude.OrdinalsFilter filter = includeExclude == null ? null : includeExclude.convertToOrdinalsFilter();
|
||||||
|
return new GlobalOrdinalsStringTermsAggregator(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, order, bucketCountThresholds, filter, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -80,7 +82,8 @@ public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory<Values
|
||||||
Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource,
|
Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource,
|
||||||
Terms.Order order, TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
|
Terms.Order order, TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
|
||||||
AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode subAggCollectMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
|
AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode subAggCollectMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
|
||||||
return new GlobalOrdinalsStringTermsAggregator.WithHash(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, order, bucketCountThresholds, includeExclude, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData);
|
final IncludeExclude.OrdinalsFilter filter = includeExclude == null ? null : includeExclude.convertToOrdinalsFilter();
|
||||||
|
return new GlobalOrdinalsStringTermsAggregator.WithHash(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, order, bucketCountThresholds, filter, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
|
|
||||||
package org.elasticsearch.search.aggregations.bucket.terms;
|
package org.elasticsearch.search.aggregations.bucket.terms;
|
||||||
|
|
||||||
|
import org.apache.lucene.util.automaton.RegExp;
|
||||||
import org.elasticsearch.ElasticsearchIllegalArgumentException;
|
import org.elasticsearch.ElasticsearchIllegalArgumentException;
|
||||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||||
import org.elasticsearch.search.aggregations.Aggregator.SubAggCollectionMode;
|
import org.elasticsearch.search.aggregations.Aggregator.SubAggCollectionMode;
|
||||||
|
@ -37,9 +38,7 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder<TermsBuilder> {
|
||||||
private Terms.ValueType valueType;
|
private Terms.ValueType valueType;
|
||||||
private Terms.Order order;
|
private Terms.Order order;
|
||||||
private String includePattern;
|
private String includePattern;
|
||||||
private int includeFlags;
|
|
||||||
private String excludePattern;
|
private String excludePattern;
|
||||||
private int excludeFlags;
|
|
||||||
private String executionHint;
|
private String executionHint;
|
||||||
private SubAggCollectionMode collectionMode;
|
private SubAggCollectionMode collectionMode;
|
||||||
private Boolean showTermDocCountError;
|
private Boolean showTermDocCountError;
|
||||||
|
@ -88,26 +87,15 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder<TermsBuilder> {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Define a regular expression that will determine what terms should be aggregated. The regular expression is based
|
* Define a regular expression that will determine what terms should be aggregated. The regular expression is based
|
||||||
* on the {@link java.util.regex.Pattern} class.
|
* on the {@link RegExp} class.
|
||||||
*
|
*
|
||||||
* @see #include(String, int)
|
* @see {@link RegExp#RegExp(String)}
|
||||||
*/
|
*/
|
||||||
public TermsBuilder include(String regex) {
|
public TermsBuilder include(String regex) {
|
||||||
return include(regex, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Define a regular expression that will determine what terms should be aggregated. The regular expression is based
|
|
||||||
* on the {@link java.util.regex.Pattern} class.
|
|
||||||
*
|
|
||||||
* @see java.util.regex.Pattern#compile(String, int)
|
|
||||||
*/
|
|
||||||
public TermsBuilder include(String regex, int flags) {
|
|
||||||
if (includeTerms != null) {
|
if (includeTerms != null) {
|
||||||
throw new ElasticsearchIllegalArgumentException("exclude clause must be an array of strings or a regex, not both");
|
throw new ElasticsearchIllegalArgumentException("exclude clause must be an array of strings or a regex, not both");
|
||||||
}
|
}
|
||||||
this.includePattern = regex;
|
this.includePattern = regex;
|
||||||
this.includeFlags = flags;
|
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -160,29 +148,18 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder<TermsBuilder> {
|
||||||
}
|
}
|
||||||
return termsAsString;
|
return termsAsString;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Define a regular expression that will filter out terms that should be excluded from the aggregation. The regular
|
|
||||||
* expression is based on the {@link java.util.regex.Pattern} class.
|
|
||||||
*
|
|
||||||
* @see #exclude(String, int)
|
|
||||||
*/
|
|
||||||
public TermsBuilder exclude(String regex) {
|
|
||||||
return exclude(regex, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Define a regular expression that will filter out terms that should be excluded from the aggregation. The regular
|
* Define a regular expression that will filter out terms that should be excluded from the aggregation. The regular
|
||||||
* expression is based on the {@link java.util.regex.Pattern} class.
|
* expression is based on the {@link RegExp} class.
|
||||||
*
|
*
|
||||||
* @see java.util.regex.Pattern#compile(String, int)
|
* @see {@link RegExp#RegExp(String)}
|
||||||
*/
|
*/
|
||||||
public TermsBuilder exclude(String regex, int flags) {
|
public TermsBuilder exclude(String regex) {
|
||||||
if (excludeTerms != null) {
|
if (excludeTerms != null) {
|
||||||
throw new ElasticsearchIllegalArgumentException("exclude clause must be an array of exact values or a regex, not both");
|
throw new ElasticsearchIllegalArgumentException("exclude clause must be an array of exact values or a regex, not both");
|
||||||
}
|
}
|
||||||
this.excludePattern = regex;
|
this.excludePattern = regex;
|
||||||
this.excludeFlags = flags;
|
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -287,27 +264,13 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder<TermsBuilder> {
|
||||||
builder.array("include", includeTerms);
|
builder.array("include", includeTerms);
|
||||||
}
|
}
|
||||||
if (includePattern != null) {
|
if (includePattern != null) {
|
||||||
if (includeFlags == 0) {
|
builder.field("include", includePattern);
|
||||||
builder.field("include", includePattern);
|
|
||||||
} else {
|
|
||||||
builder.startObject("include")
|
|
||||||
.field("pattern", includePattern)
|
|
||||||
.field("flags", includeFlags)
|
|
||||||
.endObject();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if (excludeTerms != null) {
|
if (excludeTerms != null) {
|
||||||
builder.array("exclude", excludeTerms);
|
builder.array("exclude", excludeTerms);
|
||||||
}
|
}
|
||||||
if (excludePattern != null) {
|
if (excludePattern != null) {
|
||||||
if (excludeFlags == 0) {
|
builder.field("exclude", excludePattern);
|
||||||
builder.field("exclude", excludePattern);
|
|
||||||
} else {
|
|
||||||
builder.startObject("exclude")
|
|
||||||
.field("pattern", excludePattern)
|
|
||||||
.field("flags", excludeFlags)
|
|
||||||
.endObject();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return builder;
|
return builder;
|
||||||
}
|
}
|
||||||
|
|
|
@ -46,7 +46,7 @@ public class TermsParser implements Aggregator.Parser {
|
||||||
public AggregatorFactory parse(String aggregationName, XContentParser parser, SearchContext context) throws IOException {
|
public AggregatorFactory parse(String aggregationName, XContentParser parser, SearchContext context) throws IOException {
|
||||||
TermsParametersParser aggParser = new TermsParametersParser();
|
TermsParametersParser aggParser = new TermsParametersParser();
|
||||||
ValuesSourceParser vsParser = ValuesSourceParser.any(aggregationName, StringTerms.TYPE, context).scriptable(true).formattable(true).build();
|
ValuesSourceParser vsParser = ValuesSourceParser.any(aggregationName, StringTerms.TYPE, context).scriptable(true).formattable(true).build();
|
||||||
IncludeExclude.Parser incExcParser = new IncludeExclude.Parser(aggregationName, StringTerms.TYPE, context);
|
IncludeExclude.Parser incExcParser = new IncludeExclude.Parser();
|
||||||
aggParser.parse(aggregationName, parser, context, vsParser, incExcParser);
|
aggParser.parse(aggregationName, parser, context, vsParser, incExcParser);
|
||||||
|
|
||||||
List<OrderElement> orderElements = aggParser.getOrderElements();
|
List<OrderElement> orderElements = aggParser.getOrderElements();
|
||||||
|
|
|
@ -20,22 +20,30 @@ package org.elasticsearch.search.aggregations.bucket.terms.support;
|
||||||
|
|
||||||
import com.carrotsearch.hppc.LongOpenHashSet;
|
import com.carrotsearch.hppc.LongOpenHashSet;
|
||||||
import com.carrotsearch.hppc.LongSet;
|
import com.carrotsearch.hppc.LongSet;
|
||||||
|
|
||||||
import org.apache.lucene.index.RandomAccessOrds;
|
import org.apache.lucene.index.RandomAccessOrds;
|
||||||
|
import org.apache.lucene.index.SortedSetDocValues;
|
||||||
|
import org.apache.lucene.index.Terms;
|
||||||
import org.apache.lucene.index.TermsEnum;
|
import org.apache.lucene.index.TermsEnum;
|
||||||
import org.apache.lucene.util.*;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.LongBitSet;
|
||||||
|
import org.apache.lucene.util.NumericUtils;
|
||||||
|
import org.apache.lucene.util.automaton.Automata;
|
||||||
|
import org.apache.lucene.util.automaton.Automaton;
|
||||||
|
import org.apache.lucene.util.automaton.ByteRunAutomaton;
|
||||||
|
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||||
|
import org.apache.lucene.util.automaton.Operations;
|
||||||
|
import org.apache.lucene.util.automaton.RegExp;
|
||||||
|
import org.elasticsearch.ElasticsearchIllegalArgumentException;
|
||||||
import org.elasticsearch.ElasticsearchParseException;
|
import org.elasticsearch.ElasticsearchParseException;
|
||||||
import org.elasticsearch.ExceptionsHelper;
|
|
||||||
import org.elasticsearch.common.regex.Regex;
|
|
||||||
import org.elasticsearch.common.xcontent.XContentParser;
|
import org.elasticsearch.common.xcontent.XContentParser;
|
||||||
import org.elasticsearch.search.aggregations.InternalAggregation;
|
|
||||||
import org.elasticsearch.search.aggregations.support.ValuesSource;
|
import org.elasticsearch.search.aggregations.support.ValuesSource;
|
||||||
import org.elasticsearch.search.internal.SearchContext;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.regex.Matcher;
|
import java.util.SortedSet;
|
||||||
import java.util.regex.Pattern;
|
import java.util.TreeSet;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Defines the include/exclude regular expression filtering for string terms aggregation. In this filtering logic,
|
* Defines the include/exclude regular expression filtering for string terms aggregation. In this filtering logic,
|
||||||
|
@ -43,8 +51,8 @@ import java.util.regex.Pattern;
|
||||||
*/
|
*/
|
||||||
public class IncludeExclude {
|
public class IncludeExclude {
|
||||||
|
|
||||||
// The includeValue and excludeValue ByteRefs which are the result of the parsing
|
// The includeValue and excludeValue ByteRefs which are the result of the parsing
|
||||||
// process are converted into a LongFilter when used on numeric fields
|
// process are converted into a LongFilter when used on numeric fields
|
||||||
// in the index.
|
// in the index.
|
||||||
public static class LongFilter {
|
public static class LongFilter {
|
||||||
private LongSet valids;
|
private LongSet valids;
|
||||||
|
@ -72,152 +80,145 @@ public class IncludeExclude {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private final Matcher include;
|
// Only used for the 'map' execution mode (ie. scripts)
|
||||||
private final Matcher exclude;
|
public static class StringFilter {
|
||||||
private final CharsRefBuilder scratch = new CharsRefBuilder();
|
|
||||||
private Set<BytesRef> includeValues;
|
private final ByteRunAutomaton runAutomaton;
|
||||||
private Set<BytesRef> excludeValues;
|
|
||||||
private final boolean hasRegexTest;
|
private StringFilter(Automaton automaton) {
|
||||||
|
this.runAutomaton = new ByteRunAutomaton(automaton);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns whether the given value is accepted based on the {@code include} & {@code exclude} patterns.
|
||||||
|
*/
|
||||||
|
public boolean accept(BytesRef value) {
|
||||||
|
return runAutomaton.run(value.bytes, value.offset, value.length);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static class OrdinalsFilter {
|
||||||
|
|
||||||
|
private final CompiledAutomaton compiled;
|
||||||
|
|
||||||
|
private OrdinalsFilter(Automaton automaton) {
|
||||||
|
this.compiled = new CompiledAutomaton(automaton);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Computes which global ordinals are accepted by this IncludeExclude instance.
|
||||||
|
*/
|
||||||
|
public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals, ValuesSource.Bytes.WithOrdinals valueSource) throws IOException {
|
||||||
|
LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
|
||||||
|
TermsEnum globalTermsEnum;
|
||||||
|
Terms globalTerms = new DocValuesTerms(globalOrdinals);
|
||||||
|
// TODO: specialize based on compiled.type: for ALL and prefixes (sinkState >= 0 ) we can avoid i/o and just set bits.
|
||||||
|
globalTermsEnum = compiled.getTermsEnum(globalTerms);
|
||||||
|
for (BytesRef term = globalTermsEnum.next(); term != null; term = globalTermsEnum.next()) {
|
||||||
|
acceptedGlobalOrdinals.set(globalTermsEnum.ord());
|
||||||
|
}
|
||||||
|
return acceptedGlobalOrdinals;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private final RegExp include, exclude;
|
||||||
|
private final SortedSet<BytesRef> includeValues, excludeValues;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param include The regular expression pattern for the terms to be included
|
* @param include The regular expression pattern for the terms to be included
|
||||||
* (may only be {@code null} if one of the other arguments is none-null.
|
|
||||||
* @param includeValues The terms to be included
|
|
||||||
* (may only be {@code null} if one of the other arguments is none-null.
|
|
||||||
* @param exclude The regular expression pattern for the terms to be excluded
|
* @param exclude The regular expression pattern for the terms to be excluded
|
||||||
* (may only be {@code null} if one of the other arguments is none-null.
|
|
||||||
* @param excludeValues The terms to be excluded
|
|
||||||
* (may only be {@code null} if one of the other arguments is none-null.
|
|
||||||
*/
|
*/
|
||||||
public IncludeExclude(Pattern include, Pattern exclude, Set<BytesRef> includeValues, Set<BytesRef> excludeValues) {
|
public IncludeExclude(RegExp include, RegExp exclude) {
|
||||||
assert includeValues != null || include != null ||
|
if (include == null && exclude == null) {
|
||||||
exclude != null || excludeValues != null : "includes & excludes cannot both be null"; // otherwise IncludeExclude object should be null
|
throw new IllegalArgumentException();
|
||||||
this.include = include != null ? include.matcher("") : null;
|
}
|
||||||
this.exclude = exclude != null ? exclude.matcher("") : null;
|
this.include = include;
|
||||||
hasRegexTest = include != null || exclude != null;
|
this.exclude = exclude;
|
||||||
|
this.includeValues = null;
|
||||||
|
this.excludeValues = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param includeValues The terms to be included
|
||||||
|
* @param excludeValues The terms to be excluded
|
||||||
|
*/
|
||||||
|
public IncludeExclude(SortedSet<BytesRef> includeValues, SortedSet<BytesRef> excludeValues) {
|
||||||
|
if (includeValues == null && excludeValues == null) {
|
||||||
|
throw new IllegalArgumentException();
|
||||||
|
}
|
||||||
|
this.include = null;
|
||||||
|
this.exclude = null;
|
||||||
this.includeValues = includeValues;
|
this.includeValues = includeValues;
|
||||||
this.excludeValues = excludeValues;
|
this.excludeValues = excludeValues;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns whether the given value is accepted based on the {@code include} & {@code exclude} patterns.
|
* Terms adapter around doc values.
|
||||||
*/
|
*/
|
||||||
public boolean accept(BytesRef value) {
|
private static class DocValuesTerms extends Terms {
|
||||||
|
|
||||||
if (hasRegexTest) {
|
private final SortedSetDocValues values;
|
||||||
// We need to perform UTF8 to UTF16 conversion for use in the regex matching
|
|
||||||
scratch.copyUTF8Bytes(value);
|
|
||||||
}
|
|
||||||
return isIncluded(value, scratch.get()) && !isExcluded(value, scratch.get());
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean isIncluded(BytesRef value, CharsRef utf16Chars) {
|
|
||||||
|
|
||||||
if ((includeValues == null) && (include == null)) {
|
DocValuesTerms(SortedSetDocValues values) {
|
||||||
// No include criteria to be tested.
|
this.values = values;
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (include != null) {
|
@Override
|
||||||
if (include.reset(scratch.get()).matches()) {
|
public TermsEnum iterator(TermsEnum reuse) throws IOException {
|
||||||
return true;
|
return values.termsEnum();
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if (includeValues != null) {
|
|
||||||
if (includeValues.contains(value)) {
|
@Override
|
||||||
return true;
|
public long size() throws IOException {
|
||||||
}
|
return -1;
|
||||||
}
|
}
|
||||||
// Some include criteria was tested but no match found
|
|
||||||
return false;
|
@Override
|
||||||
}
|
public long getSumTotalTermFreq() throws IOException {
|
||||||
|
return -1;
|
||||||
private boolean isExcluded(BytesRef value, CharsRef utf16Chars) {
|
|
||||||
if (exclude != null) {
|
|
||||||
if (exclude.reset(scratch.get()).matches()) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if (excludeValues != null) {
|
|
||||||
if (excludeValues.contains(value)) {
|
@Override
|
||||||
return true;
|
public long getSumDocFreq() throws IOException {
|
||||||
}
|
return -1;
|
||||||
}
|
}
|
||||||
// No exclude criteria was tested or no match found
|
|
||||||
return false;
|
@Override
|
||||||
|
public int getDocCount() throws IOException {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean hasFreqs() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean hasOffsets() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean hasPositions() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean hasPayloads() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Computes which global ordinals are accepted by this IncludeExclude instance.
|
|
||||||
*/
|
|
||||||
public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals, ValuesSource.Bytes.WithOrdinals valueSource) {
|
|
||||||
LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
|
|
||||||
// There are 3 ways of populating this bitset:
|
|
||||||
// 1) Looking up the global ordinals for known "include" terms
|
|
||||||
// 2) Looking up the global ordinals for known "exclude" terms
|
|
||||||
// 3) Traversing the term enum for all terms and running past regexes
|
|
||||||
// Option 3 is known to be very slow in the case of high-cardinality fields and
|
|
||||||
// should be avoided if possible.
|
|
||||||
if (includeValues != null) {
|
|
||||||
// optimize for the case where the set of accepted values is a set
|
|
||||||
// of known terms, not a regex that would have to be tested against all terms in the index
|
|
||||||
for (BytesRef includeValue : includeValues) {
|
|
||||||
// We need to perform UTF8 to UTF16 conversion for use in the regex matching
|
|
||||||
scratch.copyUTF8Bytes(includeValue);
|
|
||||||
if (!isExcluded(includeValue, scratch.get())) {
|
|
||||||
long ord = globalOrdinals.lookupTerm(includeValue);
|
|
||||||
if (ord >= 0) {
|
|
||||||
acceptedGlobalOrdinals.set(ord);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if(hasRegexTest) {
|
|
||||||
// We have includeVals that are a regex or only regex excludes - we need to do the potentially
|
|
||||||
// slow option of hitting termsEnum for every term in the index.
|
|
||||||
TermsEnum globalTermsEnum = globalOrdinals.termsEnum();
|
|
||||||
try {
|
|
||||||
for (BytesRef term = globalTermsEnum.next(); term != null; term = globalTermsEnum.next()) {
|
|
||||||
if (accept(term)) {
|
|
||||||
acceptedGlobalOrdinals.set(globalTermsEnum.ord());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (IOException e) {
|
|
||||||
throw ExceptionsHelper.convertToElastic(e);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// we only have a set of known values to exclude - create a bitset with all good values and negate the known bads
|
|
||||||
acceptedGlobalOrdinals.set(0, acceptedGlobalOrdinals.length());
|
|
||||||
for (BytesRef excludeValue : excludeValues) {
|
|
||||||
long ord = globalOrdinals.lookupTerm(excludeValue);
|
|
||||||
if (ord >= 0) {
|
|
||||||
acceptedGlobalOrdinals.clear(ord);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return acceptedGlobalOrdinals;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static class Parser {
|
public static class Parser {
|
||||||
|
|
||||||
private final String aggName;
|
|
||||||
private final InternalAggregation.Type aggType;
|
|
||||||
private final SearchContext context;
|
|
||||||
|
|
||||||
String include = null;
|
String include = null;
|
||||||
int includeFlags = 0; // 0 means no flags
|
|
||||||
String exclude = null;
|
String exclude = null;
|
||||||
int excludeFlags = 0; // 0 means no flags
|
SortedSet<BytesRef> includeValues;
|
||||||
Set<BytesRef> includeValues;
|
SortedSet<BytesRef> excludeValues;
|
||||||
Set<BytesRef> excludeValues;
|
|
||||||
|
|
||||||
public Parser(String aggName, InternalAggregation.Type aggType, SearchContext context) {
|
|
||||||
this.aggName = aggName;
|
|
||||||
this.aggType = aggType;
|
|
||||||
this.context = context;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean token(String currentFieldName, XContentParser.Token token, XContentParser parser) throws IOException {
|
public boolean token(String currentFieldName, XContentParser.Token token, XContentParser parser) throws IOException {
|
||||||
|
|
||||||
|
@ -231,14 +232,14 @@ public class IncludeExclude {
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (token == XContentParser.Token.START_ARRAY) {
|
if (token == XContentParser.Token.START_ARRAY) {
|
||||||
if ("include".equals(currentFieldName)) {
|
if ("include".equals(currentFieldName)) {
|
||||||
includeValues = parseArrayToSet(parser);
|
includeValues = new TreeSet<>(parseArrayToSet(parser));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if ("exclude".equals(currentFieldName)) {
|
if ("exclude".equals(currentFieldName)) {
|
||||||
excludeValues = parseArrayToSet(parser);
|
excludeValues = new TreeSet<>(parseArrayToSet(parser));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
|
@ -252,12 +253,6 @@ public class IncludeExclude {
|
||||||
} else if (token == XContentParser.Token.VALUE_STRING) {
|
} else if (token == XContentParser.Token.VALUE_STRING) {
|
||||||
if ("pattern".equals(currentFieldName)) {
|
if ("pattern".equals(currentFieldName)) {
|
||||||
include = parser.text();
|
include = parser.text();
|
||||||
} else if ("flags".equals(currentFieldName)) {
|
|
||||||
includeFlags = Regex.flagsFromString(parser.text());
|
|
||||||
}
|
|
||||||
} else if (token == XContentParser.Token.VALUE_NUMBER) {
|
|
||||||
if ("flags".equals(currentFieldName)) {
|
|
||||||
includeFlags = parser.intValue();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -268,12 +263,6 @@ public class IncludeExclude {
|
||||||
} else if (token == XContentParser.Token.VALUE_STRING) {
|
} else if (token == XContentParser.Token.VALUE_STRING) {
|
||||||
if ("pattern".equals(currentFieldName)) {
|
if ("pattern".equals(currentFieldName)) {
|
||||||
exclude = parser.text();
|
exclude = parser.text();
|
||||||
} else if ("flags".equals(currentFieldName)) {
|
|
||||||
excludeFlags = Regex.flagsFromString(parser.text());
|
|
||||||
}
|
|
||||||
} else if (token == XContentParser.Token.VALUE_NUMBER) {
|
|
||||||
if ("flags".equals(currentFieldName)) {
|
|
||||||
excludeFlags = parser.intValue();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -298,19 +287,50 @@ public class IncludeExclude {
|
||||||
}
|
}
|
||||||
return set;
|
return set;
|
||||||
}
|
}
|
||||||
|
|
||||||
public IncludeExclude includeExclude() {
|
public IncludeExclude includeExclude() {
|
||||||
if (include == null && exclude == null && includeValues == null && excludeValues == null) {
|
RegExp includePattern = include != null ? new RegExp(include) : null;
|
||||||
|
RegExp excludePattern = exclude != null ? new RegExp(exclude) : null;
|
||||||
|
if (includePattern != null || excludePattern != null) {
|
||||||
|
if (includeValues != null || excludeValues != null) {
|
||||||
|
throw new ElasticsearchIllegalArgumentException("Can only use regular expression include/exclude or a set of values, not both");
|
||||||
|
}
|
||||||
|
return new IncludeExclude(includePattern, excludePattern);
|
||||||
|
} else if (includeValues != null || excludeValues != null) {
|
||||||
|
return new IncludeExclude(includeValues, excludeValues);
|
||||||
|
} else {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
Pattern includePattern = include != null ? Pattern.compile(include, includeFlags) : null;
|
|
||||||
Pattern excludePattern = exclude != null ? Pattern.compile(exclude, excludeFlags) : null;
|
|
||||||
return new IncludeExclude(includePattern, excludePattern, includeValues, excludeValues);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean isRegexBased() {
|
public boolean isRegexBased() {
|
||||||
return hasRegexTest;
|
return include != null || exclude != null;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Automaton toAutomaton() {
|
||||||
|
Automaton a = null;
|
||||||
|
if (include != null) {
|
||||||
|
a = include.toAutomaton();
|
||||||
|
} else if (includeValues != null) {
|
||||||
|
a = Automata.makeStringUnion(includeValues);
|
||||||
|
} else {
|
||||||
|
a = Automata.makeAnyString();
|
||||||
|
}
|
||||||
|
if (exclude != null) {
|
||||||
|
a = Operations.minus(a, exclude.toAutomaton(), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
|
||||||
|
} else if (excludeValues != null) {
|
||||||
|
a = Operations.minus(a, Automata.makeStringUnion(excludeValues), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
|
||||||
|
}
|
||||||
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
|
public StringFilter convertToStringFilter() {
|
||||||
|
return new StringFilter(toAutomaton());
|
||||||
|
}
|
||||||
|
|
||||||
|
public OrdinalsFilter convertToOrdinalsFilter() {
|
||||||
|
return new OrdinalsFilter(toAutomaton());
|
||||||
}
|
}
|
||||||
|
|
||||||
public LongFilter convertToLongFilter() {
|
public LongFilter convertToLongFilter() {
|
||||||
|
@ -329,6 +349,7 @@ public class IncludeExclude {
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
public LongFilter convertToDoubleFilter() {
|
public LongFilter convertToDoubleFilter() {
|
||||||
int numValids = includeValues == null ? 0 : includeValues.size();
|
int numValids = includeValues == null ? 0 : includeValues.size();
|
||||||
int numInvalids = excludeValues == null ? 0 : excludeValues.size();
|
int numInvalids = excludeValues == null ? 0 : excludeValues.size();
|
||||||
|
|
|
@ -0,0 +1,130 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.benchmark.search.aggregations;
|
||||||
|
|
||||||
|
import org.apache.lucene.util.TestUtil;
|
||||||
|
import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
|
||||||
|
import org.elasticsearch.action.bulk.BulkRequestBuilder;
|
||||||
|
import org.elasticsearch.action.bulk.BulkResponse;
|
||||||
|
import org.elasticsearch.action.search.SearchResponse;
|
||||||
|
import org.elasticsearch.client.Client;
|
||||||
|
import org.elasticsearch.common.StopWatch;
|
||||||
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
import org.elasticsearch.common.unit.TimeValue;
|
||||||
|
import org.elasticsearch.index.query.QueryBuilders;
|
||||||
|
import org.elasticsearch.node.Node;
|
||||||
|
|
||||||
|
import java.util.Random;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
import static org.elasticsearch.client.Requests.createIndexRequest;
|
||||||
|
import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_REPLICAS;
|
||||||
|
import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_SHARDS;
|
||||||
|
import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;
|
||||||
|
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
|
||||||
|
import static org.elasticsearch.node.NodeBuilder.nodeBuilder;
|
||||||
|
import static org.elasticsearch.search.aggregations.AggregationBuilders.terms;
|
||||||
|
|
||||||
|
public class IncludeExcludeAggregationSearchBenchmark {
|
||||||
|
|
||||||
|
private static final Random R = new Random();
|
||||||
|
private static final String CLUSTER_NAME = IncludeExcludeAggregationSearchBenchmark.class.getSimpleName();
|
||||||
|
private static final int NUM_DOCS = 10000000;
|
||||||
|
private static final int BATCH = 100;
|
||||||
|
private static final int WARM = 3;
|
||||||
|
private static final int RUNS = 10;
|
||||||
|
private static final int ITERS = 3;
|
||||||
|
|
||||||
|
public static void main(String[] args) {
|
||||||
|
Settings settings = settingsBuilder()
|
||||||
|
.put("index.refresh_interval", "-1")
|
||||||
|
.put(SETTING_NUMBER_OF_SHARDS, 1)
|
||||||
|
.put(SETTING_NUMBER_OF_REPLICAS, 0)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
Node[] nodes = new Node[1];
|
||||||
|
for (int i = 0; i < nodes.length; i++) {
|
||||||
|
nodes[i] = nodeBuilder().clusterName(CLUSTER_NAME)
|
||||||
|
.settings(settingsBuilder().put(settings).put("name", "node" + i))
|
||||||
|
.node();
|
||||||
|
}
|
||||||
|
|
||||||
|
Node clientNode = nodeBuilder()
|
||||||
|
.clusterName(CLUSTER_NAME)
|
||||||
|
.settings(settingsBuilder().put(settings).put("name", "client")).client(true).node();
|
||||||
|
|
||||||
|
Client client = clientNode.client();
|
||||||
|
|
||||||
|
try {
|
||||||
|
client.admin().indices().create(createIndexRequest("index").settings(settings).mapping("type",
|
||||||
|
jsonBuilder().startObject().startObject("type").startObject("properties")
|
||||||
|
.startObject("str")
|
||||||
|
.field("type", "string")
|
||||||
|
.field("index", "not_analyzed")
|
||||||
|
.endObject()
|
||||||
|
.endObject().endObject().endObject())).actionGet();
|
||||||
|
|
||||||
|
System.out.println("Indexing " + NUM_DOCS + " documents");
|
||||||
|
|
||||||
|
StopWatch stopWatch = new StopWatch().start();
|
||||||
|
for (int i = 0; i < NUM_DOCS; ) {
|
||||||
|
BulkRequestBuilder request = client.prepareBulk();
|
||||||
|
for (int j = 0; j < BATCH && i < NUM_DOCS; ++j) {
|
||||||
|
request.add(client.prepareIndex("index", "type", Integer.toString(i)).setSource("str", TestUtil.randomSimpleString(R)));
|
||||||
|
++i;
|
||||||
|
}
|
||||||
|
BulkResponse response = request.execute().actionGet();
|
||||||
|
if (response.hasFailures()) {
|
||||||
|
System.err.println("--> failures...");
|
||||||
|
System.err.println(response.buildFailureMessage());
|
||||||
|
}
|
||||||
|
if ((i % 100000) == 0) {
|
||||||
|
System.out.println("--> Indexed " + i + " took " + stopWatch.stop().lastTaskTime());
|
||||||
|
stopWatch.start();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
client.admin().indices().prepareRefresh("index").execute().actionGet();
|
||||||
|
} catch (Exception e) {
|
||||||
|
System.out.println("Index already exists, skipping index creation");
|
||||||
|
}
|
||||||
|
|
||||||
|
ClusterHealthResponse clusterHealthResponse = client.admin().cluster().prepareHealth().setWaitForGreenStatus().setTimeout("10m").execute().actionGet();
|
||||||
|
if (clusterHealthResponse.isTimedOut()) {
|
||||||
|
System.err.println("--> Timed out waiting for cluster health");
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < WARM + RUNS; ++i) {
|
||||||
|
if (i >= WARM) {
|
||||||
|
System.out.println("RUN " + (i - WARM));
|
||||||
|
}
|
||||||
|
long start = System.nanoTime();
|
||||||
|
SearchResponse resp = null;
|
||||||
|
for (int j = 0; j < ITERS; ++j) {
|
||||||
|
resp = client.prepareSearch("index").setQuery(QueryBuilders.prefixQuery("str", "sf")).setSize(0).addAggregation(terms("t").field("str").include("s.*")).execute().actionGet();
|
||||||
|
}
|
||||||
|
long end = System.nanoTime();
|
||||||
|
if (i >= WARM) {
|
||||||
|
System.out.println(new TimeValue((end - start) / ITERS, TimeUnit.NANOSECONDS));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -387,86 +387,6 @@ public class StringTermsTests extends AbstractTermsTests {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
|
||||||
public void singleValueField_WithRegexFiltering_WithFlags() throws Exception {
|
|
||||||
|
|
||||||
// include without exclude
|
|
||||||
// we should be left with: val000, val001, val002, val003, val004, val005, val006, val007, val008, val009
|
|
||||||
// with case insensitive flag on the include regex
|
|
||||||
|
|
||||||
SearchResponse response = client().prepareSearch("idx").setTypes("high_card_type")
|
|
||||||
.addAggregation(terms("terms")
|
|
||||||
.executionHint(randomExecutionHint())
|
|
||||||
.field(SINGLE_VALUED_FIELD_NAME)
|
|
||||||
.collectMode(randomFrom(SubAggCollectionMode.values())).include("VAL00.+", Pattern.CASE_INSENSITIVE))
|
|
||||||
.execute().actionGet();
|
|
||||||
|
|
||||||
assertSearchResponse(response);
|
|
||||||
|
|
||||||
Terms terms = response.getAggregations().get("terms");
|
|
||||||
assertThat(terms, notNullValue());
|
|
||||||
assertThat(terms.getName(), equalTo("terms"));
|
|
||||||
assertThat(terms.getBuckets().size(), equalTo(10));
|
|
||||||
|
|
||||||
for (int i = 0; i < 10; i++) {
|
|
||||||
Terms.Bucket bucket = terms.getBucketByKey("val00" + i);
|
|
||||||
assertThat(bucket, notNullValue());
|
|
||||||
assertThat(key(bucket), equalTo("val00" + i));
|
|
||||||
assertThat(bucket.getDocCount(), equalTo(1l));
|
|
||||||
}
|
|
||||||
|
|
||||||
// include and exclude
|
|
||||||
// we should be left with: val002, val003, val004, val005, val006, val007, val008, val009
|
|
||||||
// with multi-flag masking on the exclude regex
|
|
||||||
|
|
||||||
response = client().prepareSearch("idx").setTypes("high_card_type")
|
|
||||||
.addAggregation(terms("terms")
|
|
||||||
.executionHint(randomExecutionHint())
|
|
||||||
.field(SINGLE_VALUED_FIELD_NAME)
|
|
||||||
.collectMode(randomFrom(SubAggCollectionMode.values())).include("val00.+").exclude("( val000 | VAL001 )#this is a comment", Pattern.CASE_INSENSITIVE | Pattern.COMMENTS))
|
|
||||||
.execute().actionGet();
|
|
||||||
|
|
||||||
assertSearchResponse(response);
|
|
||||||
|
|
||||||
terms = response.getAggregations().get("terms");
|
|
||||||
assertThat(terms, notNullValue());
|
|
||||||
assertThat(terms.getName(), equalTo("terms"));
|
|
||||||
assertThat(terms.getBuckets().size(), equalTo(8));
|
|
||||||
|
|
||||||
for (int i = 2; i < 10; i++) {
|
|
||||||
Terms.Bucket bucket = terms.getBucketByKey("val00" + i);
|
|
||||||
assertThat(bucket, notNullValue());
|
|
||||||
assertThat(key(bucket), equalTo("val00" + i));
|
|
||||||
assertThat(bucket.getDocCount(), equalTo(1l));
|
|
||||||
}
|
|
||||||
|
|
||||||
// exclude without include
|
|
||||||
// we should be left with: val000, val001, val002, val003, val004, val005, val006, val007, val008, val009
|
|
||||||
// with a "no flag" flag
|
|
||||||
|
|
||||||
response = client().prepareSearch("idx").setTypes("high_card_type")
|
|
||||||
.addAggregation(terms("terms")
|
|
||||||
.executionHint(randomExecutionHint())
|
|
||||||
.field(SINGLE_VALUED_FIELD_NAME)
|
|
||||||
.collectMode(randomFrom(SubAggCollectionMode.values())).exclude("val0[1-9]+.+", 0))
|
|
||||||
.execute().actionGet();
|
|
||||||
|
|
||||||
assertSearchResponse(response);
|
|
||||||
|
|
||||||
terms = response.getAggregations().get("terms");
|
|
||||||
assertThat(terms, notNullValue());
|
|
||||||
assertThat(terms.getName(), equalTo("terms"));
|
|
||||||
assertThat(terms.getBuckets().size(), equalTo(10));
|
|
||||||
|
|
||||||
for (int i = 0; i < 10; i++) {
|
|
||||||
Terms.Bucket bucket = terms.getBucketByKey("val00" + i);
|
|
||||||
assertThat(bucket, notNullValue());
|
|
||||||
assertThat(key(bucket), equalTo("val00" + i));
|
|
||||||
assertThat(bucket.getDocCount(), equalTo(1l));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void singleValueField_WithExactTermFiltering() throws Exception {
|
public void singleValueField_WithExactTermFiltering() throws Exception {
|
||||||
// include without exclude
|
// include without exclude
|
||||||
|
|
Loading…
Reference in New Issue