Merge pull request #10418 from jpountz/enhancement/speed_up_aggs_include_exclude

Aggregations: Speed up include/exclude in terms aggregations with regexps. Close #10418
2015-04-09 12:16:37 +02:00 · 2015-04-09 12:16:37 +02:00 · e25db222ee
parent 2a844fc457 aecd9ac515
commit e25db222ee
14 changed files with 337 additions and 329 deletions
--- a/docs/reference/migration/migrate_2_0.asciidoc
+++ b/docs/reference/migration/migrate_2_0.asciidoc
@ -139,6 +139,9 @@ equivalent to the former `pre_zone` option. Setting `time_zone` to a value like
 being applied in the specified time zone but In addition to this, also the `pre_zone_adjust_large_interval` is removed because we
 now always return dates and bucket keys in UTC.

+`include`/`exclude` filtering on the `terms` aggregation now uses the same syntax as regexp queries instead of the Java `Pattern` regular expression syntax. While simple
+regexps should still work, more complex ones might need some rewriting. Also, the `flags` parameter is no longer supported.
+
 === Terms filter lookup caching

 The terms filter lookup mechanism does not support the `cache` option anymore
--- a/docs/reference/search/aggregations/bucket/terms-aggregation.asciidoc
+++ b/docs/reference/search/aggregations/bucket/terms-aggregation.asciidoc
@ -482,42 +482,7 @@ with `water_` (so the tag `water_sports` will no be aggregated). The `include` r
 values are "allowed" to be aggregated, while the `exclude` determines the values that should not be aggregated. When
 both are defined, the `exclude` has precedence, meaning, the `include` is evaluated first and only then the `exclude`.

-The regular expression are based on the Java(TM) http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html[Pattern],
-and as such, they it is also possible to pass in flags that will determine how the compiled regular expression will work:
-
-[source,js]
--------------------------------------------------
-{
-    "aggs" : {
-        "tags" : {
-             "terms" : {
-                 "field" : "tags",
-                 "include" : {
-                     "pattern" : ".*sport.*",
-                     "flags" : "CANON_EQ|CASE_INSENSITIVE" <1>
-                 },
-                 "exclude" : {
-                     "pattern" : "water_.*",
-                     "flags" : "CANON_EQ|CASE_INSENSITIVE"
-                 }
-             }
-         }
-    }
-}
--------------------------------------------------
-
-<1> the flags are concatenated using the `|` character as a separator
-
-The possible flags that can be used are:
-http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#CANON_EQ[`CANON_EQ`],
-http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#CASE_INSENSITIVE[`CASE_INSENSITIVE`],
-http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#COMMENTS[`COMMENTS`],
-http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#DOTALL[`DOTALL`],
-http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#LITERAL[`LITERAL`],
-http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#MULTILINE[`MULTILINE`],
-http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNICODE_CASE[`UNICODE_CASE`],
-http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNICODE_CHARACTER_CLASS[`UNICODE_CHARACTER_CLASS`] and
-http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNIX_LINES[`UNIX_LINES`]
+The syntax is the same as <<regexp-syntax,regexp queries>>.

 For matching based on exact values the `include` and `exclude` parameters can simply take an array of
 strings that represent the terms as they are found in the index:
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/GlobalOrdinalsSignificantTermsAggregator.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/GlobalOrdinalsSignificantTermsAggregator.java
@ -48,7 +48,7 @@ public class GlobalOrdinalsSignificantTermsAggregator extends GlobalOrdinalsStri

    public GlobalOrdinalsSignificantTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource,
                                                    BucketCountThresholds bucketCountThresholds,
-                                                    IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent,
+                                                    IncludeExclude.OrdinalsFilter includeExclude, AggregationContext aggregationContext, Aggregator parent,
                                                    SignificantTermsAggregatorFactory termsAggFactory, Map<String, Object> metaData) throws IOException {

        super(name, factories, valuesSource, null, bucketCountThresholds, includeExclude, aggregationContext, parent, SubAggCollectionMode.DEPTH_FIRST, false, metaData);
@ -145,7 +145,7 @@ public class GlobalOrdinalsSignificantTermsAggregator extends GlobalOrdinalsStri

        private final LongHash bucketOrds;

-        public WithHash(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource, BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggFactory, Map<String, Object> metaData) throws IOException {
+        public WithHash(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource, BucketCountThresholds bucketCountThresholds, IncludeExclude.OrdinalsFilter includeExclude, AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggFactory, Map<String, Object> metaData) throws IOException {
            super(name, factories, valuesSource, bucketCountThresholds, includeExclude, aggregationContext, parent, termsAggFactory, metaData);
            bucketOrds = new LongHash(1, aggregationContext.bigArrays());
        }
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java
@ -47,7 +47,7 @@ public class SignificantStringTermsAggregator extends StringTermsAggregator {

    public SignificantStringTermsAggregator(String name, AggregatorFactories factories, ValuesSource valuesSource,
            BucketCountThresholds bucketCountThresholds,
-            IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent,
+            IncludeExclude.StringFilter includeExclude, AggregationContext aggregationContext, Aggregator parent,
            SignificantTermsAggregatorFactory termsAggFactory, Map<String, Object> metaData) throws IOException {

        super(name, factories, valuesSource, null, bucketCountThresholds, includeExclude, aggregationContext, parent, SubAggCollectionMode.DEPTH_FIRST, false, metaData);
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java
@ -65,7 +65,8 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
            Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource,
                              TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
                              AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggregatorFactory, Map<String, Object> metaData) throws IOException {
-                return new SignificantStringTermsAggregator(name, factories, valuesSource, bucketCountThresholds, includeExclude, aggregationContext, parent, termsAggregatorFactory, metaData);
+                final IncludeExclude.StringFilter filter = includeExclude == null ? null : includeExclude.convertToStringFilter();
+                return new SignificantStringTermsAggregator(name, factories, valuesSource, bucketCountThresholds, filter, aggregationContext, parent, termsAggregatorFactory, metaData);
            }

        },
@ -77,7 +78,8 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
                              AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggregatorFactory, Map<String, Object> metaData) throws IOException {
                ValuesSource.Bytes.WithOrdinals valueSourceWithOrdinals = (ValuesSource.Bytes.WithOrdinals) valuesSource;
                IndexSearcher indexSearcher = aggregationContext.searchContext().searcher();
-                return new GlobalOrdinalsSignificantTermsAggregator(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, bucketCountThresholds, includeExclude, aggregationContext, parent, termsAggregatorFactory, metaData);
+                final IncludeExclude.OrdinalsFilter filter = includeExclude == null ? null : includeExclude.convertToOrdinalsFilter();
+                return new GlobalOrdinalsSignificantTermsAggregator(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, bucketCountThresholds, filter, aggregationContext, parent, termsAggregatorFactory, metaData);
            }

        },
@ -87,7 +89,8 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
            Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource,
                              TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
                              AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggregatorFactory, Map<String, Object> metaData) throws IOException {
-                return new GlobalOrdinalsSignificantTermsAggregator.WithHash(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, bucketCountThresholds, includeExclude, aggregationContext, parent, termsAggregatorFactory, metaData);
+                final IncludeExclude.OrdinalsFilter filter = includeExclude == null ? null : includeExclude.convertToOrdinalsFilter();
+                return new GlobalOrdinalsSignificantTermsAggregator.WithHash(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, bucketCountThresholds, filter, aggregationContext, parent, termsAggregatorFactory, metaData);
            }
        };

--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsParser.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsParser.java
@ -57,7 +57,7 @@ public class SignificantTermsParser implements Aggregator.Parser {
                .scriptable(false)
                .formattable(true)
                .build();
-        IncludeExclude.Parser incExcParser = new IncludeExclude.Parser(aggregationName, SignificantStringTerms.TYPE, context);
+        IncludeExclude.Parser incExcParser = new IncludeExclude.Parser();
        aggParser.parse(aggregationName, parser, context, vsParser, incExcParser);

        TermsAggregator.BucketCountThresholds bucketCountThresholds = aggParser.getBucketCountThresholds();
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java
@ -57,7 +57,7 @@ import java.util.Map;
 public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggregator {

    protected final ValuesSource.Bytes.WithOrdinals.FieldData valuesSource;
-    protected final IncludeExclude includeExclude;
+    protected final IncludeExclude.OrdinalsFilter includeExclude;

    // TODO: cache the acceptedglobalValues per aggregation definition.
    // We can't cache this yet in ValuesSource, since ValuesSource is reused per field for aggs during the execution.
@ -71,7 +71,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr

    public GlobalOrdinalsStringTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource,
                                               Terms.Order order, BucketCountThresholds bucketCountThresholds,
-                                               IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
+                                               IncludeExclude.OrdinalsFilter includeExclude, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
        super(name, factories, aggregationContext, parent, order, bucketCountThresholds, collectionMode, showTermDocCountError, metaData);
        this.valuesSource = valuesSource;
        this.includeExclude = includeExclude;
@ -260,7 +260,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
        private final LongHash bucketOrds;

        public WithHash(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource,
-                        Terms.Order order, BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude, AggregationContext aggregationContext,
+                        Terms.Order order, BucketCountThresholds bucketCountThresholds, IncludeExclude.OrdinalsFilter includeExclude, AggregationContext aggregationContext,
                        Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
            super(name, factories, valuesSource, order, bucketCountThresholds, includeExclude, aggregationContext, parent, collectionMode, showTermDocCountError, metaData);
            bucketOrds = new LongHash(1, aggregationContext.bigArrays());
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregator.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregator.java
@ -45,11 +45,11 @@ public class StringTermsAggregator extends AbstractStringTermsAggregator {

    private final ValuesSource valuesSource;
    protected final BytesRefHash bucketOrds;
-    private final IncludeExclude includeExclude;
+    private final IncludeExclude.StringFilter includeExclude;

    public StringTermsAggregator(String name, AggregatorFactories factories, ValuesSource valuesSource,
            Terms.Order order, BucketCountThresholds bucketCountThresholds,
-            IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
+            IncludeExclude.StringFilter includeExclude, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {

        super(name, factories, aggregationContext, parent, order, bucketCountThresholds, collectionMode, showTermDocCountError, metaData);
        this.valuesSource = valuesSource;
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java
@ -50,7 +50,8 @@ public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory<Values
            Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource,
                              Terms.Order order, TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
                              AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode subAggCollectMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
-                return new StringTermsAggregator(name, factories, valuesSource, order, bucketCountThresholds, includeExclude, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData);
+                final IncludeExclude.StringFilter filter = includeExclude == null ? null : includeExclude.convertToStringFilter();
+                return new StringTermsAggregator(name, factories, valuesSource, order, bucketCountThresholds, filter, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData);
            }

            @Override
@ -65,7 +66,8 @@ public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory<Values
            Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource,
                              Terms.Order order, TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
                              AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode subAggCollectMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
-                return new GlobalOrdinalsStringTermsAggregator(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, order, bucketCountThresholds, includeExclude, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData);
+                final IncludeExclude.OrdinalsFilter filter = includeExclude == null ? null : includeExclude.convertToOrdinalsFilter();
+                return new GlobalOrdinalsStringTermsAggregator(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, order, bucketCountThresholds, filter, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData);
            }

            @Override
@ -80,7 +82,8 @@ public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory<Values
            Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource,
                              Terms.Order order, TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
                              AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode subAggCollectMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
-                return new GlobalOrdinalsStringTermsAggregator.WithHash(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, order, bucketCountThresholds, includeExclude, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData);
+                final IncludeExclude.OrdinalsFilter filter = includeExclude == null ? null : includeExclude.convertToOrdinalsFilter();
+                return new GlobalOrdinalsStringTermsAggregator.WithHash(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, order, bucketCountThresholds, filter, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData);
            }

            @Override
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsBuilder.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsBuilder.java
@ -19,6 +19,7 @@

 package org.elasticsearch.search.aggregations.bucket.terms;

+import org.apache.lucene.util.automaton.RegExp;
 import org.elasticsearch.ElasticsearchIllegalArgumentException;
 import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.search.aggregations.Aggregator.SubAggCollectionMode;
@ -37,9 +38,7 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder<TermsBuilder> {
    private Terms.ValueType valueType;
    private Terms.Order order;
    private String includePattern;
-    private int includeFlags;
    private String excludePattern;
-    private int excludeFlags;
    private String executionHint;
    private SubAggCollectionMode collectionMode;
    private Boolean showTermDocCountError;
@ -88,26 +87,15 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder<TermsBuilder> {

    /**
     * Define a regular expression that will determine what terms should be aggregated. The regular expression is based
-     * on the {@link java.util.regex.Pattern} class.
+     * on the {@link RegExp} class.
     *
-     * @see #include(String, int)
+     * @see RegExp#RegExp(String)
     */
    public TermsBuilder include(String regex) {
-        return include(regex, 0);
-    }
-
-    /**
-     * Define a regular expression that will determine what terms should be aggregated. The regular expression is based
-     * on the {@link java.util.regex.Pattern} class.
-     *
-     * @see java.util.regex.Pattern#compile(String, int)
-     */
-    public TermsBuilder include(String regex, int flags) {
        if (includeTerms != null) {
            throw new ElasticsearchIllegalArgumentException("exclude clause must be an array of strings or a regex, not both");
        }
        this.includePattern = regex;
-        this.includeFlags = flags;
        return this;
    }
    
@ -160,29 +148,18 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder<TermsBuilder> {
        }
        return termsAsString;
    }    
-    
-    /**
-     * Define a regular expression that will filter out terms that should be excluded from the aggregation. The regular
-     * expression is based on the {@link java.util.regex.Pattern} class.
-     *
-     * @see #exclude(String, int)
-     */
-    public TermsBuilder exclude(String regex) {
-        return exclude(regex, 0);
-    }

    /**
     * Define a regular expression that will filter out terms that should be excluded from the aggregation. The regular
-     * expression is based on the {@link java.util.regex.Pattern} class.
+     * expression is based on the {@link RegExp} class.
     *
-     * @see java.util.regex.Pattern#compile(String, int)
+     * @see RegExp#RegExp(String)
     */
-    public TermsBuilder exclude(String regex, int flags) {
+    public TermsBuilder exclude(String regex) {
        if (excludeTerms != null) {
            throw new ElasticsearchIllegalArgumentException("exclude clause must be an array of exact values or a regex, not both");
        }
        this.excludePattern = regex;
-        this.excludeFlags = flags;
        return this;
    }
    
@ -287,27 +264,13 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder<TermsBuilder> {
            builder.array("include", includeTerms);
        }
        if (includePattern != null) {
-            if (includeFlags == 0) {
-                builder.field("include", includePattern);
-            } else {
-                builder.startObject("include")
-                        .field("pattern", includePattern)
-                        .field("flags", includeFlags)
-                        .endObject();
-            }
+            builder.field("include", includePattern);
        }
        if (excludeTerms != null) {
            builder.array("exclude", excludeTerms);
        }
        if (excludePattern != null) {
-            if (excludeFlags == 0) {
-                builder.field("exclude", excludePattern);
-            } else {
-                builder.startObject("exclude")
-                        .field("pattern", excludePattern)
-                        .field("flags", excludeFlags)
-                        .endObject();
-            }
+            builder.field("exclude", excludePattern);
        }
        return builder;
    }
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsParser.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsParser.java
@ -46,7 +46,7 @@ public class TermsParser implements Aggregator.Parser {
    public AggregatorFactory parse(String aggregationName, XContentParser parser, SearchContext context) throws IOException {
        TermsParametersParser aggParser = new TermsParametersParser();
        ValuesSourceParser vsParser = ValuesSourceParser.any(aggregationName, StringTerms.TYPE, context).scriptable(true).formattable(true).build();
-        IncludeExclude.Parser incExcParser = new IncludeExclude.Parser(aggregationName, StringTerms.TYPE, context);
+        IncludeExclude.Parser incExcParser = new IncludeExclude.Parser();
        aggParser.parse(aggregationName, parser, context, vsParser, incExcParser);

        List<OrderElement> orderElements = aggParser.getOrderElements();
--- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/support/IncludeExclude.java
+++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/support/IncludeExclude.java
@ -20,22 +20,30 @@ package org.elasticsearch.search.aggregations.bucket.terms.support;

 import com.carrotsearch.hppc.LongOpenHashSet;
 import com.carrotsearch.hppc.LongSet;
+
 import org.apache.lucene.index.RandomAccessOrds;
+import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.util.*;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LongBitSet;
+import org.apache.lucene.util.NumericUtils;
+import org.apache.lucene.util.automaton.Automata;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.ByteRunAutomaton;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
+import org.apache.lucene.util.automaton.Operations;
+import org.apache.lucene.util.automaton.RegExp;
+import org.elasticsearch.ElasticsearchIllegalArgumentException;
 import org.elasticsearch.ElasticsearchParseException;
-import org.elasticsearch.ExceptionsHelper;
-import org.elasticsearch.common.regex.Regex;
 import org.elasticsearch.common.xcontent.XContentParser;
-import org.elasticsearch.search.aggregations.InternalAggregation;
 import org.elasticsearch.search.aggregations.support.ValuesSource;
-import org.elasticsearch.search.internal.SearchContext;

 import java.io.IOException;
 import java.util.HashSet;
 import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
+import java.util.SortedSet;
+import java.util.TreeSet;

 /**
 * Defines the include/exclude regular expression filtering for string terms aggregation. In this filtering logic,
@ -43,8 +51,8 @@ import java.util.regex.Pattern;
 */
 public class IncludeExclude {

-    // The includeValue and excludeValue ByteRefs which are the result of the parsing 
-    // process are converted into a LongFilter when used on numeric fields 
+    // The includeValue and excludeValue ByteRefs which are the result of the parsing
+    // process are converted into a LongFilter when used on numeric fields
    // in the index.
    public static class LongFilter {
        private LongSet valids;
@ -72,152 +80,145 @@ public class IncludeExclude {
        }
    }

-    private final Matcher include;
-    private final Matcher exclude;
-    private final CharsRefBuilder scratch = new CharsRefBuilder();
-    private Set<BytesRef> includeValues;
-    private Set<BytesRef> excludeValues;
-    private final boolean hasRegexTest;
+    /**
+     * Term filter that tests values one at a time against an automaton.
+     * Only used for the 'map' execution mode (ie. scripts).
+     */
+    public static class StringFilter {
+
+        private final ByteRunAutomaton runAutomaton;
+
+        private StringFilter(Automaton acceptedTerms) {
+            runAutomaton = new ByteRunAutomaton(acceptedTerms);
+        }
+
+        /**
+         * Runs the bytes of {@code value} through the automaton.
+         *
+         * @param value the term to test; only the {@code offset}/{@code length} slice of its bytes is read
+         * @return whether the given value is accepted based on the {@code include} & {@code exclude} patterns
+         */
+        public boolean accept(BytesRef value) {
+            final boolean accepted = runAutomaton.run(value.bytes, value.offset, value.length);
+            return accepted;
+        }
+    }
+
+    /**
+     * Filter that resolves the accepted terms to global ordinals in one pass over the terms dictionary.
+     */
+    public static class OrdinalsFilter {
+
+        private final CompiledAutomaton compiled;
+
+        private OrdinalsFilter(Automaton acceptedTerms) {
+            compiled = new CompiledAutomaton(acceptedTerms);
+        }
+
+        /**
+         * Computes which global ordinals are accepted by the compiled automaton.
+         *
+         * @param globalOrdinals the global ordinals whose terms are enumerated; its value count sizes the result
+         * @param valueSource    not read by this implementation
+         * @return a bit set in which the bit of every accepted global ordinal is set
+         * @throws IOException propagated from the terms enumeration
+         */
+        public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals, ValuesSource.Bytes.WithOrdinals valueSource) throws IOException {
+            final LongBitSet accepted = new LongBitSet(globalOrdinals.getValueCount());
+            // TODO: specialize based on compiled.type: for ALL and prefixes (sinkState >= 0 ) we can avoid i/o and just set bits.
+            final TermsEnum matchingTerms = compiled.getTermsEnum(new DocValuesTerms(globalOrdinals));
+            // The enum only returns terms matched by the automaton, so every visited ordinal is accepted
+            while (matchingTerms.next() != null) {
+                accepted.set(matchingTerms.ord());
+            }
+            return accepted;
+        }
+
+    }
+
+    // Regular expressions; both are null when this instance was built from sets of values.
+    private final RegExp include;
+    private final RegExp exclude;
+    // Explicit terms; both are null when this instance was built from regular expressions.
+    private final SortedSet<BytesRef> includeValues;
+    private final SortedSet<BytesRef> excludeValues;

    /**
     * @param include   The regular expression pattern for the terms to be included
-     *                  (may only be {@code null} if one of the other arguments is none-null.
-     * @param includeValues   The terms to be included
-     *                  (may only be {@code null} if one of the other arguments is none-null.
     * @param exclude   The regular expression pattern for the terms to be excluded
-     *                  (may only be {@code null} if one of the other arguments is none-null.
-     * @param excludeValues   The terms to be excluded
-     *                  (may only be {@code null} if one of the other arguments is none-null.
     */
-    public IncludeExclude(Pattern include, Pattern exclude, Set<BytesRef> includeValues, Set<BytesRef> excludeValues) {
-        assert includeValues != null || include != null || 
-                exclude != null || excludeValues != null : "includes & excludes cannot both be null"; // otherwise IncludeExclude object should be null
-        this.include = include != null ? include.matcher("") : null;
-        this.exclude = exclude != null ? exclude.matcher("") : null;
-        hasRegexTest = include != null || exclude != null;
+    public IncludeExclude(RegExp include, RegExp exclude) {
+        if (include == null && exclude == null) {
+            // With nothing to filter on there is no point in building an instance: fail with an explicit reason
+            throw new IllegalArgumentException("include and exclude regular expressions cannot both be null");
+        }
+        this.include = include;
+        this.exclude = exclude;
+        // A regexp-based instance never carries explicit values
+        this.includeValues = null;
+        this.excludeValues = null;
+    }
+
+    /**
+     * Creates a filter based on explicit sets of terms; the regular expressions are left {@code null}.
+     *
+     * @param includeValues   The terms to be included
+     * @param excludeValues   The terms to be excluded
+     * @throws IllegalArgumentException if both {@code includeValues} and {@code excludeValues} are {@code null}
+     */
+    public IncludeExclude(SortedSet<BytesRef> includeValues, SortedSet<BytesRef> excludeValues) {
+        if (includeValues == null && excludeValues == null) {
+            throw new IllegalArgumentException("includeValues and excludeValues cannot both be null");
+        }
+        this.include = null;
+        this.exclude = null;
        this.includeValues = includeValues;
        this.excludeValues = excludeValues;
    }

    /**
-     * Returns whether the given value is accepted based on the {@code include} & {@code exclude} patterns.
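+     * Terms adapter around doc values: only the terms enumeration is backed by the doc values, every
+     * statistic is reported as {@code -1} and frequencies, offsets, positions and payloads as absent.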
     */
-    public boolean accept(BytesRef value) {
+    private static class DocValuesTerms extends Terms {

-        if (hasRegexTest) {
-            // We need to perform UTF8 to UTF16 conversion for use in the regex matching
-            scratch.copyUTF8Bytes(value);            
-        }
-        return isIncluded(value, scratch.get()) && !isExcluded(value, scratch.get());
-    }
-    
-    private boolean isIncluded(BytesRef value, CharsRef utf16Chars) {
+        private final SortedSetDocValues values;

-        if ((includeValues == null) && (include == null)) {
-            // No include criteria to be tested.
-            return true;
+        DocValuesTerms(SortedSetDocValues values) {
+            this.values = values;
        }
-        
-        if (include != null) {
-            if (include.reset(scratch.get()).matches()) {
-                return true;
-            }
+
+        @Override
+        public TermsEnum iterator(TermsEnum reuse) throws IOException {
+            return values.termsEnum();
        }
-        if (includeValues != null) {
-            if (includeValues.contains(value)) {
-                return true;
-            }
+
+        @Override
+        public long size() throws IOException {
+            return -1;
        }
-        // Some include criteria was tested but no match found
-        return false;
-    }
-    
-    private boolean isExcluded(BytesRef value, CharsRef utf16Chars) {
-        if (exclude != null) {
-            if (exclude.reset(scratch.get()).matches()) {
-                return true;
-            }
+
+        @Override
+        public long getSumTotalTermFreq() throws IOException {
+            return -1;
        }
-        if (excludeValues != null) {
-            if (excludeValues.contains(value)) {
-                return true;
-            }
+
+        @Override
+        public long getSumDocFreq() throws IOException {
+            return -1;
        }
-        // No exclude criteria was tested or no match found
-        return false;
+
+        @Override
+        public int getDocCount() throws IOException {
+            return -1;
+        }
+
+        @Override
+        public boolean hasFreqs() {
+            return false;
+        }
+
+        @Override
+        public boolean hasOffsets() {
+            return false;
+        }
+
+        @Override
+        public boolean hasPositions() {
+            return false;
+        }
+
+        @Override
+        public boolean hasPayloads() {
+            return false;
+        }
+
    }

-    /**
-     * Computes which global ordinals are accepted by this IncludeExclude instance.
-     */
-    public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals, ValuesSource.Bytes.WithOrdinals valueSource) {
-        LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
-        // There are 3 ways of populating this bitset: 
-        // 1) Looking up the global ordinals for known "include" terms
-        // 2) Looking up the global ordinals for known "exclude" terms
-        // 3) Traversing the term enum for all terms and running past regexes
-        // Option 3 is known to be very slow in the case of high-cardinality fields and
-        // should be avoided if possible.
-        if (includeValues != null) {
-            // optimize for the case where the set of accepted values is a set
-            // of known terms, not a regex that would have to be tested against all terms in the index
-            for (BytesRef includeValue : includeValues) {
-                // We need to perform UTF8 to UTF16 conversion for use in the regex matching
-                scratch.copyUTF8Bytes(includeValue); 
-                if (!isExcluded(includeValue, scratch.get())) {
-                    long ord = globalOrdinals.lookupTerm(includeValue);
-                    if (ord >= 0) {
-                        acceptedGlobalOrdinals.set(ord);
-                    }
-                }
-            }
-        } else {
-            if(hasRegexTest) {
-                // We have includeVals that are a regex or only regex excludes - we need to do the potentially 
-                // slow option of hitting termsEnum for every term in the index.
-                TermsEnum globalTermsEnum = globalOrdinals.termsEnum();
-                try {
-                    for (BytesRef term = globalTermsEnum.next(); term != null; term = globalTermsEnum.next()) {
-                        if (accept(term)) {
-                            acceptedGlobalOrdinals.set(globalTermsEnum.ord());
-                        }
-                    }
-                } catch (IOException e) {
-                    throw ExceptionsHelper.convertToElastic(e);
-                }
-            } else {
-                // we only have a set of known values to exclude - create a bitset with all good values and negate the known bads
-                acceptedGlobalOrdinals.set(0, acceptedGlobalOrdinals.length());
-                for (BytesRef excludeValue : excludeValues) {
-                    long ord = globalOrdinals.lookupTerm(excludeValue);
-                    if (ord >= 0) {
-                        acceptedGlobalOrdinals.clear(ord);
-                    }
-                }
-                
-            }
-        }
-        return acceptedGlobalOrdinals;
-    }
+

    public static class Parser {

-        private final String aggName;
-        private final InternalAggregation.Type aggType;
-        private final SearchContext context;
-
        String include = null;
-        int includeFlags = 0; // 0 means no flags
        String exclude = null;
-        int excludeFlags = 0; // 0 means no flags
-        Set<BytesRef> includeValues;
-        Set<BytesRef> excludeValues;
-
-        public Parser(String aggName, InternalAggregation.Type aggType, SearchContext context) {
-            this.aggName = aggName;
-            this.aggType = aggType;
-            this.context = context;
-        }
+        SortedSet<BytesRef> includeValues;
+        SortedSet<BytesRef> excludeValues;

        public boolean token(String currentFieldName, XContentParser.Token token, XContentParser parser) throws IOException {

@ -231,14 +232,14 @@ public class IncludeExclude {
                }
                return true;
            }
-            
+
            if (token == XContentParser.Token.START_ARRAY) {
                if ("include".equals(currentFieldName)) {
-                     includeValues = parseArrayToSet(parser);
+                     includeValues = new TreeSet<>(parseArrayToSet(parser));
                     return true;
-                } 
+                }
                if ("exclude".equals(currentFieldName)) {
-                      excludeValues = parseArrayToSet(parser);
+                      excludeValues = new TreeSet<>(parseArrayToSet(parser));
                      return true;
                }
                return false;
@ -252,12 +253,6 @@ public class IncludeExclude {
                        } else if (token == XContentParser.Token.VALUE_STRING) {
                            if ("pattern".equals(currentFieldName)) {
                                include = parser.text();
-                            } else if ("flags".equals(currentFieldName)) {
-                                includeFlags = Regex.flagsFromString(parser.text());
-                            }
-                        } else if (token == XContentParser.Token.VALUE_NUMBER) {
-                            if ("flags".equals(currentFieldName)) {
-                                includeFlags = parser.intValue();
                            }
                        }
                    }
@ -268,12 +263,6 @@ public class IncludeExclude {
                        } else if (token == XContentParser.Token.VALUE_STRING) {
                            if ("pattern".equals(currentFieldName)) {
                                exclude = parser.text();
-                            } else if ("flags".equals(currentFieldName)) {
-                                excludeFlags = Regex.flagsFromString(parser.text());
-                            }
-                        } else if (token == XContentParser.Token.VALUE_NUMBER) {
-                            if ("flags".equals(currentFieldName)) {
-                                excludeFlags = parser.intValue();
                            }
                        }
                    }
@ -298,19 +287,50 @@ public class IncludeExclude {
            }
            return set;
        }
-        
+
        public IncludeExclude includeExclude() {
-            if (include == null && exclude == null && includeValues == null && excludeValues == null) {
+            // Builds the IncludeExclude from what has been parsed so far. Regular expressions and explicit
+            // sets of values are mutually exclusive; when neither was provided, null is returned.
+            RegExp includePattern =  include != null ? new RegExp(include) : null;
+            RegExp excludePattern = exclude != null ? new RegExp(exclude) : null;
+            if (includePattern != null || excludePattern != null) {
+                // At least one regexp was given: reject requests that also provide sets of values
+                if (includeValues != null || excludeValues != null) {
+                    throw new ElasticsearchIllegalArgumentException("Can only use regular expression include/exclude or a set of values, not both");
+                }
+                return new IncludeExclude(includePattern, excludePattern);
+            } else if (includeValues != null || excludeValues != null) {
+                return new IncludeExclude(includeValues, excludeValues);
+            } else {
+                // Neither patterns nor values were parsed
                return null;
            }
-            Pattern includePattern =  include != null ? Pattern.compile(include, includeFlags) : null;
-            Pattern excludePattern = exclude != null ? Pattern.compile(exclude, excludeFlags) : null;
-            return new IncludeExclude(includePattern, excludePattern, includeValues, excludeValues);
        }
    }

    public boolean isRegexBased() {
-        return hasRegexTest;
+        // Regexp-based as soon as at least one of the two patterns is set
+        return !(include == null && exclude == null);
+    }
+
+    /**
+     * Builds the automaton of accepted terms: the included terms minus the excluded ones.
+     */
+    private Automaton toAutomaton() {
+        // Included side: regexp, explicit values, or any string when no include criterion is set
+        final Automaton included;
+        if (include != null) {
+            included = include.toAutomaton();
+        } else if (includeValues != null) {
+            included = Automata.makeStringUnion(includeValues);
+        } else {
+            included = Automata.makeAnyString();
+        }
+
+        // Excluded side: regexp or explicit values; without any, the included automaton is the result
+        final Automaton excluded;
+        if (exclude != null) {
+            excluded = exclude.toAutomaton();
+        } else if (excludeValues != null) {
+            excluded = Automata.makeStringUnion(excludeValues);
+        } else {
+            return included;
+        }
+        return Operations.minus(included, excluded, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
+    }
+
+    /**
+     * Creates a {@link StringFilter} from the automaton of accepted terms.
+     */
+    public StringFilter convertToStringFilter() {
+        final Automaton accepted = toAutomaton();
+        return new StringFilter(accepted);
+    }
+
+    /**
+     * Creates an {@link OrdinalsFilter} from the automaton of accepted terms.
+     */
+    public OrdinalsFilter convertToOrdinalsFilter() {
+        final Automaton accepted = toAutomaton();
+        return new OrdinalsFilter(accepted);
    }

    public LongFilter convertToLongFilter() {
@ -329,6 +349,7 @@ public class IncludeExclude {
        }
        return result;
    }
+
    public LongFilter convertToDoubleFilter() {
        int numValids = includeValues == null ? 0 : includeValues.size();
        int numInvalids = excludeValues == null ? 0 : excludeValues.size();
--- a/src/test/java/org/elasticsearch/benchmark/search/aggregations/IncludeExcludeAggregationSearchBenchmark.java
+++ b/src/test/java/org/elasticsearch/benchmark/search/aggregations/IncludeExcludeAggregationSearchBenchmark.java
@ -0,0 +1,130 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.benchmark.search.aggregations;
+
+import org.apache.lucene.util.TestUtil;
+import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
+import org.elasticsearch.action.bulk.BulkRequestBuilder;
+import org.elasticsearch.action.bulk.BulkResponse;
+import org.elasticsearch.action.search.SearchResponse;
+import org.elasticsearch.client.Client;
+import org.elasticsearch.common.StopWatch;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.unit.TimeValue;
+import org.elasticsearch.index.query.QueryBuilders;
+import org.elasticsearch.node.Node;
+
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+
+import static org.elasticsearch.client.Requests.createIndexRequest;
+import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_REPLICAS;
+import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_SHARDS;
+import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;
+import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
+import static org.elasticsearch.node.NodeBuilder.nodeBuilder;
+import static org.elasticsearch.search.aggregations.AggregationBuilders.terms;
+
+/**
+ * Standalone benchmark that times a {@code terms} aggregation with a regular expression {@code include}
+ * on a single-shard index of random strings. Run it through {@link #main(String[])}.
+ */
+public class IncludeExcludeAggregationSearchBenchmark {
+
+    private static final Random R = new Random();
+    private static final String CLUSTER_NAME = IncludeExcludeAggregationSearchBenchmark.class.getSimpleName();
+    private static final int NUM_DOCS = 10000000;
+    private static final int BATCH = 100;
+    private static final int WARM = 3;
+    private static final int RUNS = 10;
+    private static final int ITERS = 3;
+
+    public static void main(String[] args) {
+        Settings settings = settingsBuilder()
+                .put("index.refresh_interval", "-1")
+                .put(SETTING_NUMBER_OF_SHARDS, 1)
+                .put(SETTING_NUMBER_OF_REPLICAS, 0)
+                .build();
+
+        Node[] nodes = new Node[1];
+        for (int i = 0; i < nodes.length; i++) {
+            nodes[i] = nodeBuilder().clusterName(CLUSTER_NAME)
+                    .settings(settingsBuilder().put(settings).put("name", "node" + i))
+                    .node();
+        }
+
+        Node clientNode = nodeBuilder()
+                .clusterName(CLUSTER_NAME)
+                .settings(settingsBuilder().put(settings).put("name", "client")).client(true).node();
+
+        try {
+            Client client = clientNode.client();
+            indexDocuments(client, settings);
+
+            ClusterHealthResponse clusterHealthResponse = client.admin().cluster().prepareHealth().setWaitForGreenStatus().setTimeout("10m").execute().actionGet();
+            if (clusterHealthResponse.isTimedOut()) {
+                System.err.println("--> Timed out waiting for cluster health");
+            }
+
+            runSearches(client);
+        } finally {
+            // Release the nodes once done, including when a search fails
+            clientNode.close();
+            for (Node node : nodes) {
+                node.close();
+            }
+        }
+    }
+
+    /**
+     * Creates the index and fills it with {@link #NUM_DOCS} random strings, in bulks of {@link #BATCH} documents.
+     * Failures are reported and the benchmark continues with whatever index exists.
+     */
+    private static void indexDocuments(Client client, Settings settings) {
+        try {
+            client.admin().indices().create(createIndexRequest("index").settings(settings).mapping("type",
+                    jsonBuilder().startObject().startObject("type").startObject("properties")
+                        .startObject("str")
+                            .field("type", "string")
+                            .field("index", "not_analyzed")
+                        .endObject()
+                    .endObject().endObject().endObject())).actionGet();
+
+            System.out.println("Indexing " + NUM_DOCS + " documents");
+
+            StopWatch stopWatch = new StopWatch().start();
+            for (int i = 0; i < NUM_DOCS; ) {
+                BulkRequestBuilder request = client.prepareBulk();
+                for (int j = 0; j < BATCH && i < NUM_DOCS; ++j) {
+                    request.add(client.prepareIndex("index", "type", Integer.toString(i)).setSource("str", TestUtil.randomSimpleString(R)));
+                    ++i;
+                }
+                BulkResponse response = request.execute().actionGet();
+                if (response.hasFailures()) {
+                    System.err.println("--> failures...");
+                    System.err.println(response.buildFailureMessage());
+                }
+                if ((i % 100000) == 0) {
+                    System.out.println("--> Indexed " + i + " took " + stopWatch.stop().lastTaskTime());
+                    stopWatch.start();
+                }
+            }
+
+            client.admin().indices().prepareRefresh("index").execute().actionGet();
+        } catch (Exception e) {
+            // Any failure of the creation or of the indexing lands here, not only an already existing index:
+            // report the actual cause instead of assuming one.
+            System.out.println("Index creation or indexing failed, continuing with the existing index (if any): " + e);
+        }
+    }
+
+    /**
+     * Runs {@link #WARM} warm-up rounds followed by {@link #RUNS} measured rounds of {@link #ITERS} searches each,
+     * printing the average time per search of every measured round.
+     */
+    private static void runSearches(Client client) {
+        for (int i = 0; i < WARM + RUNS; ++i) {
+            if (i >= WARM) {
+                System.out.println("RUN " + (i - WARM));
+            }
+            long start = System.nanoTime();
+            for (int j = 0; j < ITERS; ++j) {
+                client.prepareSearch("index").setQuery(QueryBuilders.prefixQuery("str", "sf")).setSize(0).addAggregation(terms("t").field("str").include("s.*")).execute().actionGet();
+            }
+            long end = System.nanoTime();
+            if (i >= WARM) {
+                System.out.println(new TimeValue((end - start) / ITERS, TimeUnit.NANOSECONDS));
+            }
+        }
+    }
+
+}
--- a/src/test/java/org/elasticsearch/search/aggregations/bucket/StringTermsTests.java
+++ b/src/test/java/org/elasticsearch/search/aggregations/bucket/StringTermsTests.java
@ -387,86 +387,6 @@ public class StringTermsTests extends AbstractTermsTests {
        }
    }

-    @Test
-    public void singleValueField_WithRegexFiltering_WithFlags() throws Exception {
-
-        // include without exclude
-        // we should be left with: val000, val001, val002, val003, val004, val005, val006, val007, val008, val009
-        // with case insensitive flag on the include regex
-
-        SearchResponse response = client().prepareSearch("idx").setTypes("high_card_type")
-                .addAggregation(terms("terms")
-                        .executionHint(randomExecutionHint())
-                        .field(SINGLE_VALUED_FIELD_NAME)
-                        .collectMode(randomFrom(SubAggCollectionMode.values())).include("VAL00.+", Pattern.CASE_INSENSITIVE))
-                .execute().actionGet();
-
-        assertSearchResponse(response);
-
-        Terms terms = response.getAggregations().get("terms");
-        assertThat(terms, notNullValue());
-        assertThat(terms.getName(), equalTo("terms"));
-        assertThat(terms.getBuckets().size(), equalTo(10));
-
-        for (int i = 0; i < 10; i++) {
-            Terms.Bucket bucket = terms.getBucketByKey("val00" + i);
-            assertThat(bucket, notNullValue());
-            assertThat(key(bucket), equalTo("val00" + i));
-            assertThat(bucket.getDocCount(), equalTo(1l));
-        }
-
-        // include and exclude
-        // we should be left with: val002, val003, val004, val005, val006, val007, val008, val009
-        // with multi-flag masking on the exclude regex
-
-        response = client().prepareSearch("idx").setTypes("high_card_type")
-                .addAggregation(terms("terms")
-                        .executionHint(randomExecutionHint())
-                        .field(SINGLE_VALUED_FIELD_NAME)
-                        .collectMode(randomFrom(SubAggCollectionMode.values())).include("val00.+").exclude("( val000 | VAL001 )#this is a comment", Pattern.CASE_INSENSITIVE | Pattern.COMMENTS))
-                .execute().actionGet();
-
-        assertSearchResponse(response);
-
-        terms = response.getAggregations().get("terms");
-        assertThat(terms, notNullValue());
-        assertThat(terms.getName(), equalTo("terms"));
-        assertThat(terms.getBuckets().size(), equalTo(8));
-
-        for (int i = 2; i < 10; i++) {
-            Terms.Bucket bucket = terms.getBucketByKey("val00" + i);
-            assertThat(bucket, notNullValue());
-            assertThat(key(bucket), equalTo("val00" + i));
-            assertThat(bucket.getDocCount(), equalTo(1l));
-        }
-
-        // exclude without include
-        // we should be left with: val000, val001, val002, val003, val004, val005, val006, val007, val008, val009
-        // with a "no flag" flag
-
-        response = client().prepareSearch("idx").setTypes("high_card_type")
-                .addAggregation(terms("terms")
-                        .executionHint(randomExecutionHint())
-                        .field(SINGLE_VALUED_FIELD_NAME)
-                        .collectMode(randomFrom(SubAggCollectionMode.values())).exclude("val0[1-9]+.+", 0))
-                .execute().actionGet();
-
-        assertSearchResponse(response);
-
-        terms = response.getAggregations().get("terms");
-        assertThat(terms, notNullValue());
-        assertThat(terms.getName(), equalTo("terms"));
-        assertThat(terms.getBuckets().size(), equalTo(10));
-
-        for (int i = 0; i < 10; i++) {
-            Terms.Bucket bucket = terms.getBucketByKey("val00" + i);
-            assertThat(bucket, notNullValue());
-            assertThat(key(bucket), equalTo("val00" + i));
-            assertThat(bucket.getDocCount(), equalTo(1l));
-        }
-    }
-
-
    @Test
    public void singleValueField_WithExactTermFiltering() throws Exception {
        // include without exclude