- Added support for term filtering based on include/exclude regex on the terms agg
- Added javadoc to the TermsBuilder

Closes #4267

This commit is contained in:
parent afb0d119e4
commit 0d6a35b9a7

@@ -175,3 +175,65 @@ Generating the terms using a script:
}
--------------------------------------------------

==== Filtering Values

It is possible to filter the values for which buckets will be created. This can be done using the `include` and
`exclude` parameters which are based on regular expressions.

[source,js]
--------------------------------------------------
{
    "aggs" : {
        "tags" : {
            "terms" : {
                "field" : "tags",
                "include" : ".*sport.*",
                "exclude" : "water_.*"
            }
        }
    }
}
--------------------------------------------------

In the above example, buckets will be created for all the tags that have the word `sport` in them, except those starting
with `water_` (so the tag `water_sports` will not be aggregated). The `include` regular expression determines which
values are "allowed" to be aggregated, while the `exclude` determines the values that should not be aggregated. When
both are defined, the `exclude` takes precedence, meaning the `include` is evaluated first and only then the `exclude`.
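
To make the evaluation order concrete, here is a minimal sketch of the accept logic described above (a simplified,
standalone form of this commit's `IncludeExclude.accept`; the method shape shown here is illustrative, not the actual
internals):

[source,java]
--------------------------------------------------
import java.util.regex.Pattern;

// include is consulted first; exclude wins whenever both patterns match
static boolean accepted(String term, Pattern include, Pattern exclude) {
    if (include != null && !include.matcher(term).matches()) {
        return false; // not matched by include -> term is filtered out
    }
    return exclude == null || !exclude.matcher(term).matches();
}
--------------------------------------------------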

The regular expressions are based on the Java(TM) http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html[Pattern],
and as such, it is also possible to pass in flags that will determine how the compiled regular expression will work:

[source,js]
--------------------------------------------------
{
    "aggs" : {
        "tags" : {
            "terms" : {
                "field" : "tags",
                "include" : {
                    "pattern" : ".*sport.*",
                    "flags" : "CANON_EQ|CASE_INSENSITIVE" <1>
                },
                "exclude" : {
                    "pattern" : "water_.*",
                    "flags" : "CANON_EQ|CASE_INSENSITIVE"
                }
            }
        }
    }
}
--------------------------------------------------

<1> The flags are concatenated using the `|` character as a separator.

The possible flags that can be used are:
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#CANON_EQ[`CANON_EQ`],
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#CASE_INSENSITIVE[`CASE_INSENSITIVE`],
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#COMMENTS[`COMMENTS`],
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#DOTALL[`DOTALL`],
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#LITERAL[`LITERAL`],
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#MULTILINE[`MULTILINE`],
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNICODE_CASE[`UNICODE_CASE`],
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNICODE_CHARACTER_CLASS[`UNICODE_CHARACTER_CLASS`] and
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNIX_LINES[`UNIX_LINES`]
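
Based on the parser changes in this commit, the `flags` field may also be given as the numeric value of the combined
Java Pattern constants rather than the `|`-separated names. A minimal sketch of the equivalence (constant values taken
from the JDK):

[source,java]
--------------------------------------------------
import java.util.regex.Pattern;

// "CANON_EQ|CASE_INSENSITIVE" resolves to the same bit mask as:
int flags = Pattern.CANON_EQ | Pattern.CASE_INSENSITIVE; // 128 | 2 == 130
--------------------------------------------------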

@@ -29,6 +29,7 @@ import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.search.aggregations.AggregationStreams;
import org.elasticsearch.search.aggregations.InternalAggregation;
import org.elasticsearch.search.aggregations.InternalAggregations;
import org.elasticsearch.search.aggregations.bucket.terms.support.BucketPriorityQueue;
import org.elasticsearch.search.aggregations.support.numeric.ValueFormatter;
import org.elasticsearch.search.aggregations.support.numeric.ValueFormatterStreams;

@@ -21,12 +21,13 @@ package org.elasticsearch.search.aggregations.bucket.terms;

import org.elasticsearch.index.fielddata.DoubleValues;
import org.elasticsearch.search.aggregations.Aggregator;
import org.elasticsearch.search.aggregations.AggregatorFactories;
import org.elasticsearch.search.aggregations.InternalAggregations;
import org.elasticsearch.search.aggregations.bucket.BucketsAggregator;
import org.elasticsearch.search.aggregations.bucket.LongHash;
import org.elasticsearch.search.aggregations.bucket.terms.support.BucketPriorityQueue;
import org.elasticsearch.search.aggregations.support.AggregationContext;
import org.elasticsearch.search.aggregations.support.numeric.NumericValuesSource;
import org.elasticsearch.search.aggregations.AggregatorFactories;

import java.io.IOException;
import java.util.Arrays;

@@ -27,6 +27,7 @@ import org.elasticsearch.common.xcontent.ToXContent;
import org.elasticsearch.search.aggregations.Aggregations;
import org.elasticsearch.search.aggregations.InternalAggregation;
import org.elasticsearch.search.aggregations.InternalAggregations;
import org.elasticsearch.search.aggregations.bucket.terms.support.BucketPriorityQueue;

import java.util.*;

@@ -29,6 +29,7 @@ import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.search.aggregations.AggregationStreams;
import org.elasticsearch.search.aggregations.InternalAggregation;
import org.elasticsearch.search.aggregations.InternalAggregations;
import org.elasticsearch.search.aggregations.bucket.terms.support.BucketPriorityQueue;
import org.elasticsearch.search.aggregations.support.numeric.ValueFormatter;
import org.elasticsearch.search.aggregations.support.numeric.ValueFormatterStreams;

@@ -21,12 +21,13 @@ package org.elasticsearch.search.aggregations.bucket.terms;

import org.elasticsearch.index.fielddata.LongValues;
import org.elasticsearch.search.aggregations.Aggregator;
import org.elasticsearch.search.aggregations.AggregatorFactories;
import org.elasticsearch.search.aggregations.InternalAggregations;
import org.elasticsearch.search.aggregations.bucket.BucketsAggregator;
import org.elasticsearch.search.aggregations.bucket.LongHash;
import org.elasticsearch.search.aggregations.bucket.terms.support.BucketPriorityQueue;
import org.elasticsearch.search.aggregations.support.AggregationContext;
import org.elasticsearch.search.aggregations.support.numeric.NumericValuesSource;
import org.elasticsearch.search.aggregations.AggregatorFactories;

import java.io.IOException;
import java.util.Arrays;

@@ -23,10 +23,12 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.elasticsearch.index.fielddata.BytesValues;
import org.elasticsearch.search.aggregations.Aggregator;
import org.elasticsearch.search.aggregations.AggregatorFactories;
import org.elasticsearch.search.aggregations.bucket.BucketsAggregator;
import org.elasticsearch.search.aggregations.bucket.terms.support.BucketPriorityQueue;
import org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude;
import org.elasticsearch.search.aggregations.support.AggregationContext;
import org.elasticsearch.search.aggregations.support.ValuesSource;
import org.elasticsearch.search.aggregations.AggregatorFactories;

import java.io.IOException;
import java.util.Arrays;

@@ -45,15 +47,18 @@ public class StringTermsAggregator extends BucketsAggregator {
    private final int requiredSize;
    private final int shardSize;
    private final BytesRefHash bucketOrds;
    private final IncludeExclude includeExclude;

    public StringTermsAggregator(String name, AggregatorFactories factories, ValuesSource valuesSource,
                                 InternalOrder order, int requiredSize, int shardSize, AggregationContext aggregationContext, Aggregator parent) {
                                 InternalOrder order, int requiredSize, int shardSize,
                                 IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent) {

        super(name, BucketAggregationMode.PER_BUCKET, factories, INITIAL_CAPACITY, aggregationContext, parent);
        this.valuesSource = valuesSource;
        this.order = order;
        this.requiredSize = requiredSize;
        this.shardSize = shardSize;
        this.includeExclude = includeExclude;
        bucketOrds = new BytesRefHash();
    }

@@ -70,6 +75,9 @@ public class StringTermsAggregator extends BucketsAggregator {

        for (int i = 0; i < valuesCount; ++i) {
            final BytesRef bytes = values.nextValue();
            if (includeExclude != null && !includeExclude.accept(bytes)) {
                continue;
            }
            final int hash = values.currentValueHash();
            int bucketOrdinal = bucketOrds.add(bytes, hash);
            if (bucketOrdinal < 0) { // already seen

@@ -122,3 +130,4 @@ public class StringTermsAggregator extends BucketsAggregator {
    }

}

@@ -21,6 +21,7 @@ package org.elasticsearch.search.aggregations.bucket.terms;

import org.elasticsearch.search.aggregations.AggregationExecutionException;
import org.elasticsearch.search.aggregations.Aggregator;
import org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude;
import org.elasticsearch.search.aggregations.support.AggregationContext;
import org.elasticsearch.search.aggregations.support.ValueSourceAggregatorFactory;
import org.elasticsearch.search.aggregations.support.ValuesSource;

@@ -36,12 +37,14 @@ public class TermsAggregatorFactory extends ValueSourceAggregatorFactory {
    private final InternalOrder order;
    private final int requiredSize;
    private final int shardSize;
    private final IncludeExclude includeExclude;

    public TermsAggregatorFactory(String name, ValuesSourceConfig valueSourceConfig, InternalOrder order, int requiredSize, int shardSize) {
    public TermsAggregatorFactory(String name, ValuesSourceConfig valueSourceConfig, InternalOrder order, int requiredSize, int shardSize, IncludeExclude includeExclude) {
        super(name, StringTerms.TYPE.name(), valueSourceConfig);
        this.order = order;
        this.requiredSize = requiredSize;
        this.shardSize = shardSize;
        this.includeExclude = includeExclude;
    }

    @Override

@@ -52,7 +55,12 @@ public class TermsAggregatorFactory extends ValueSourceAggregatorFactory {
    @Override
    protected Aggregator create(ValuesSource valuesSource, long expectedBucketsCount, AggregationContext aggregationContext, Aggregator parent) {
        if (valuesSource instanceof BytesValuesSource) {
            return new StringTermsAggregator(name, factories, valuesSource, order, requiredSize, shardSize, aggregationContext, parent);
            return new StringTermsAggregator(name, factories, valuesSource, order, requiredSize, shardSize, includeExclude, aggregationContext, parent);
        }

        if (includeExclude != null) {
            throw new AggregationExecutionException("Aggregation [" + name + "] cannot support the include/exclude " +
                    "settings as it can only be applied to string values");
        }

        if (valuesSource instanceof NumericValuesSource) {
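
Note: the guard above means include/exclude is rejected for any non-string values source. A hypothetical misuse that
would trip it (the builder methods are from this commit; the field name is illustrative):

[source,java]
--------------------------------------------------
// Patterns only apply to string values; over a numeric field the factory
// throws AggregationExecutionException when the aggregator is created.
TermsBuilder invalid = new TermsBuilder("prices")
        .field("price")       // numeric field -> NumericValuesSource
        .include("1\\d+");    // rejected at aggregator creation time
--------------------------------------------------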

@@ -7,34 +7,96 @@ import java.io.IOException;
import java.util.Locale;

/**
 *
 * Builds a {@code terms} aggregation
 */
public class TermsBuilder extends ValuesSourceAggregationBuilder<TermsBuilder> {

    private int size = -1;
    private int shardSize = -1;
    private Terms.ValueType valueType;
    private Terms.Order order;
    private String includePattern;
    private int includeFlags;
    private String excludePattern;
    private int excludeFlags;

    public TermsBuilder(String name) {
        super(name, "terms");
    }

    /**
     * Sets the size - indicating how many term buckets should be returned (defaults to 10)
     */
    public TermsBuilder size(int size) {
        this.size = size;
        return this;
    }

    /**
     * Sets the shard_size - indicating the number of term buckets each shard will return to the coordinating node (the
     * node that coordinates the search execution). The higher the shard size is, the more accurate the results are.
     */
    public TermsBuilder shardSize(int shardSize) {
        this.shardSize = shardSize;
        return this;
    }

    /**
     * Define a regular expression that will determine what terms should be aggregated. The regular expression is based
     * on the {@link java.util.regex.Pattern} class.
     *
     * @see #include(String, int)
     */
    public TermsBuilder include(String regex) {
        return include(regex, 0);
    }

    /**
     * Define a regular expression that will determine what terms should be aggregated. The regular expression is based
     * on the {@link java.util.regex.Pattern} class.
     *
     * @see java.util.regex.Pattern#compile(String, int)
     */
    public TermsBuilder include(String regex, int flags) {
        this.includePattern = regex;
        this.includeFlags = flags;
        return this;
    }

    /**
     * Define a regular expression that will filter out terms that should be excluded from the aggregation. The regular
     * expression is based on the {@link java.util.regex.Pattern} class.
     *
     * @see #exclude(String, int)
     */
    public TermsBuilder exclude(String regex) {
        return exclude(regex, 0);
    }

    /**
     * Define a regular expression that will filter out terms that should be excluded from the aggregation. The regular
     * expression is based on the {@link java.util.regex.Pattern} class.
     *
     * @see java.util.regex.Pattern#compile(String, int)
     */
    public TermsBuilder exclude(String regex, int flags) {
        this.excludePattern = regex;
        this.excludeFlags = flags;
        return this;
    }

    /**
     * When using scripts, the value type indicates the types of the values the script is generating.
     */
    public TermsBuilder valueType(Terms.ValueType valueType) {
        this.valueType = valueType;
        return this;
    }

    /**
     * Defines the order in which the buckets will be returned.
     */
    public TermsBuilder order(Terms.Order order) {
        this.order = order;
        return this;

@@ -55,6 +117,26 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder<TermsBuilder> {
            builder.field("order");
            order.toXContent(builder, params);
        }
        if (includePattern != null) {
            if (includeFlags == 0) {
                builder.field("include", includePattern);
            } else {
                builder.startObject("include")
                        .field("pattern", includePattern)
                        .field("flags", includeFlags)
                        .endObject();
            }
        }
        if (excludePattern != null) {
            if (excludeFlags == 0) {
                builder.field("exclude", excludePattern);
            } else {
                builder.startObject("exclude")
                        .field("pattern", excludePattern)
                        .field("flags", excludeFlags)
                        .endObject();
            }
        }
        return builder;
    }
}
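
As a usage sketch of the new builder methods (mirroring the documented JSON example; the index field name is
illustrative):

[source,java]
--------------------------------------------------
import java.util.regex.Pattern;

TermsBuilder tags = new TermsBuilder("tags")
        .field("tags")
        .include(".*sport.*", Pattern.CANON_EQ | Pattern.CASE_INSENSITIVE)
        .exclude("water_.*"); // no flags -> serialized as a plain "exclude" string
--------------------------------------------------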

@@ -19,6 +19,7 @@

package org.elasticsearch.search.aggregations.bucket.terms;

import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.IndexNumericFieldData;

@@ -26,8 +27,10 @@ import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.index.mapper.core.DateFieldMapper;
import org.elasticsearch.index.mapper.ip.IpFieldMapper;
import org.elasticsearch.script.SearchScript;
import org.elasticsearch.search.SearchParseException;
import org.elasticsearch.search.aggregations.Aggregator;
import org.elasticsearch.search.aggregations.AggregatorFactory;
import org.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude;
import org.elasticsearch.search.aggregations.support.FieldContext;
import org.elasticsearch.search.aggregations.support.ValuesSource;
import org.elasticsearch.search.aggregations.support.ValuesSourceConfig;

@@ -39,6 +42,7 @@ import org.elasticsearch.search.internal.SearchContext;

import java.io.IOException;
import java.util.Map;
import java.util.regex.Pattern;

/**
 *

@@ -50,9 +54,6 @@ public class TermsParser implements Aggregator.Parser {
        return StringTerms.TYPE.name();
    }

    // TODO add support for shard_size (vs. size) a la terms facets
    // TODO add support for term filtering (regexp/include/exclude) a la terms facets

    @Override
    public AggregatorFactory parse(String aggregationName, XContentParser parser, SearchContext context) throws IOException {

@@ -67,6 +68,10 @@ public class TermsParser implements Aggregator.Parser {
        boolean orderAsc = false;
        String format = null;
        boolean assumeUnique = false;
        String include = null;
        int includeFlags = 0; // 0 means no flags
        String exclude = null;
        int excludeFlags = 0; // 0 means no flags

        XContentParser.Token token;

@@ -85,6 +90,10 @@ public class TermsParser implements Aggregator.Parser {
                valueType = Terms.ValueType.resolveType(parser.text());
            } else if ("format".equals(currentFieldName)) {
                format = parser.text();
            } else if ("include".equals(currentFieldName)) {
                include = parser.text();
            } else if ("exclude".equals(currentFieldName)) {
                exclude = parser.text();
            }
        } else if (token == XContentParser.Token.VALUE_BOOLEAN) {
            if ("script_values_unique".equals(currentFieldName)) {

@@ -105,8 +114,45 @@ public class TermsParser implements Aggregator.Parser {
                    orderKey = parser.currentName();
                } else if (token == XContentParser.Token.VALUE_STRING) {
                    String dir = parser.text();
                    orderAsc = "asc".equalsIgnoreCase(dir);
                    //TODO: do we want to throw a parse error if the alternative is not "desc"???
                    if ("asc".equalsIgnoreCase(dir)) {
                        orderAsc = true;
                    } else if ("desc".equalsIgnoreCase(dir)) {
                        orderAsc = false;
                    } else {
                        throw new SearchParseException(context, "Unknown terms order direction [" + dir + "] in terms aggregation [" + aggregationName + "]");
                    }
                }
            }
        } else if ("include".equals(currentFieldName)) {
            while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
                if (token == XContentParser.Token.FIELD_NAME) {
                    currentFieldName = parser.currentName();
                } else if (token == XContentParser.Token.VALUE_STRING) {
                    if ("pattern".equals(currentFieldName)) {
                        include = parser.text();
                    } else if ("flags".equals(currentFieldName)) {
                        includeFlags = Regex.flagsFromString(parser.text());
                    }
                } else if (token == XContentParser.Token.VALUE_NUMBER) {
                    if ("flags".equals(currentFieldName)) {
                        includeFlags = parser.intValue();
                    }
                }
            }
        } else if ("exclude".equals(currentFieldName)) {
            while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
                if (token == XContentParser.Token.FIELD_NAME) {
                    currentFieldName = parser.currentName();
                } else if (token == XContentParser.Token.VALUE_STRING) {
                    if ("pattern".equals(currentFieldName)) {
                        exclude = parser.text();
                    } else if ("flags".equals(currentFieldName)) {
                        excludeFlags = Regex.flagsFromString(parser.text());
                    }
                } else if (token == XContentParser.Token.VALUE_NUMBER) {
                    if ("flags".equals(currentFieldName)) {
                        excludeFlags = parser.intValue();
                    }
                }
            }
        }

@@ -118,6 +164,13 @@ public class TermsParser implements Aggregator.Parser {
            shardSize = requiredSize;
        }

        IncludeExclude includeExclude = null;
        if (include != null || exclude != null) {
            Pattern includePattern = include != null ? Pattern.compile(include, includeFlags) : null;
            Pattern excludePattern = exclude != null ? Pattern.compile(exclude, excludeFlags) : null;
            includeExclude = new IncludeExclude(includePattern, excludePattern);
        }

        InternalOrder order = resolveOrder(orderKey, orderAsc);
        SearchScript searchScript = null;
        if (script != null) {

@@ -139,14 +192,14 @@ public class TermsParser implements Aggregator.Parser {
            if (!assumeUnique) {
                config.ensureUnique(true);
            }
            return new TermsAggregatorFactory(aggregationName, config, order, requiredSize, shardSize);
            return new TermsAggregatorFactory(aggregationName, config, order, requiredSize, shardSize, includeExclude);
        }

        FieldMapper<?> mapper = context.smartNameFieldMapper(field);
        if (mapper == null) {
            ValuesSourceConfig<?> config = new ValuesSourceConfig<BytesValuesSource>(BytesValuesSource.class);
            config.unmapped(true);
            return new TermsAggregatorFactory(aggregationName, config, order, requiredSize, shardSize);
            return new TermsAggregatorFactory(aggregationName, config, order, requiredSize, shardSize, includeExclude);
        }
        IndexFieldData<?> indexFieldData = context.fieldData().getForField(mapper);

@@ -188,7 +241,7 @@ public class TermsParser implements Aggregator.Parser {
            config.ensureUnique(true);
        }

        return new TermsAggregatorFactory(aggregationName, config, order, requiredSize, shardSize);
        return new TermsAggregatorFactory(aggregationName, config, order, requiredSize, shardSize, includeExclude);
    }

    static InternalOrder resolveOrder(String key, boolean asc) {

@@ -20,9 +20,9 @@
package org.elasticsearch.search.aggregations.bucket.terms;

import org.elasticsearch.search.aggregations.Aggregator;
import org.elasticsearch.search.aggregations.AggregatorFactories;
import org.elasticsearch.search.aggregations.InternalAggregation;
import org.elasticsearch.search.aggregations.support.AggregationContext;
import org.elasticsearch.search.aggregations.AggregatorFactories;

import java.io.IOException;

@@ -17,9 +17,10 @@
 * under the License.
 */

package org.elasticsearch.search.aggregations.bucket.terms;
package org.elasticsearch.search.aggregations.bucket.terms.support;

import org.apache.lucene.util.PriorityQueue;
import org.elasticsearch.search.aggregations.bucket.terms.Terms;

import java.util.Comparator;

@@ -0,0 +1,68 @@
/*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. ElasticSearch licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.search.aggregations.bucket.terms.support;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.UnicodeUtil;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Defines the include/exclude regular expression filtering for string terms aggregation. In this filtering logic,
 * exclusion has precedence, where the {@code include} is evaluated first and then the {@code exclude}.
 */
public class IncludeExclude {

    private final Matcher include;
    private final Matcher exclude;
    private final CharsRef scratch = new CharsRef();

    /**
     * @param include   The regular expression pattern for the terms to be included
     *                  (may only be {@code null} if {@code exclude} is not {@code null})
     * @param exclude   The regular expression pattern for the terms to be excluded
     *                  (may only be {@code null} if {@code include} is not {@code null})
     */
    public IncludeExclude(Pattern include, Pattern exclude) {
        assert include != null || exclude != null : "include & exclude cannot both be null"; // otherwise IncludeExclude object should be null
        this.include = include != null ? include.matcher("") : null;
        this.exclude = exclude != null ? exclude.matcher("") : null;
    }

    /**
     * Returns whether the given value is accepted based on the {@code include} & {@code exclude} patterns.
     */
    public boolean accept(BytesRef value) {
        UnicodeUtil.UTF8toUTF16(value, scratch);
        if (include == null) {
            // exclude must not be null
            return !exclude.reset(scratch).matches();
        }
        if (!include.reset(scratch).matches()) {
            return false;
        }
        if (exclude == null) {
            return true;
        }
        return !exclude.reset(scratch).matches();
    }
}
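
A small usage sketch of the class above, demonstrating the precedence described in its javadoc (pattern values
borrowed from the documentation example):

[source,java]
--------------------------------------------------
import java.util.regex.Pattern;
import org.apache.lucene.util.BytesRef;

IncludeExclude filter = new IncludeExclude(
        Pattern.compile(".*sport.*"), Pattern.compile("water_.*"));

filter.accept(new BytesRef("sports"));       // true  - include matches, exclude does not
filter.accept(new BytesRef("water_sports")); // false - exclude wins over include
filter.accept(new BytesRef("cooking"));      // false - include does not match
--------------------------------------------------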

@@ -34,6 +34,7 @@ import org.junit.Test;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery;

@@ -104,6 +105,150 @@ public class StringTermsTests extends ElasticsearchIntegrationTest {
        }
    }

    @Test
    public void singleValueField_WithRegexFiltering() throws Exception {

        // include without exclude
        // we should be left with: val000, val001, val002, val003, val004, val005, val006, val007, val008, val009

        SearchResponse response = client().prepareSearch("idx").setTypes("high_card_type")
                .addAggregation(terms("terms")
                        .field("value").include("val00.+"))
                .execute().actionGet();

        assertThat(response.getFailedShards(), equalTo(0));

        Terms terms = response.getAggregations().get("terms");
        assertThat(terms, notNullValue());
        assertThat(terms.getName(), equalTo("terms"));
        assertThat(terms.buckets().size(), equalTo(10));

        for (int i = 0; i < 10; i++) {
            Terms.Bucket bucket = terms.getByTerm("val00" + i);
            assertThat(bucket, notNullValue());
            assertThat(bucket.getKey().string(), equalTo("val00" + i));
            assertThat(bucket.getDocCount(), equalTo(1l));
        }

        // include and exclude
        // we should be left with: val002, val003, val004, val005, val006, val007, val008, val009

        response = client().prepareSearch("idx").setTypes("high_card_type")
                .addAggregation(terms("terms")
                        .field("value").include("val00.+").exclude("(val000|val001)"))
                .execute().actionGet();

        assertThat(response.getFailedShards(), equalTo(0));

        terms = response.getAggregations().get("terms");
        assertThat(terms, notNullValue());
        assertThat(terms.getName(), equalTo("terms"));
        assertThat(terms.buckets().size(), equalTo(8));

        for (int i = 2; i < 10; i++) {
            Terms.Bucket bucket = terms.getByTerm("val00" + i);
            assertThat(bucket, notNullValue());
            assertThat(bucket.getKey().string(), equalTo("val00" + i));
            assertThat(bucket.getDocCount(), equalTo(1l));
        }

        // exclude without include
        // we should be left with: val000, val001, val002, val003, val004, val005, val006, val007, val008, val009

        response = client().prepareSearch("idx").setTypes("high_card_type")
                .addAggregation(terms("terms")
                        .field("value").exclude("val0[1-9]+.+"))
                .execute().actionGet();

        assertThat(response.getFailedShards(), equalTo(0));

        terms = response.getAggregations().get("terms");
        assertThat(terms, notNullValue());
        assertThat(terms.getName(), equalTo("terms"));
        assertThat(terms.buckets().size(), equalTo(10));

        for (int i = 0; i < 10; i++) {
            Terms.Bucket bucket = terms.getByTerm("val00" + i);
            assertThat(bucket, notNullValue());
            assertThat(bucket.getKey().string(), equalTo("val00" + i));
            assertThat(bucket.getDocCount(), equalTo(1l));
        }
    }

    @Test
    public void singleValueField_WithRegexFiltering_WithFlags() throws Exception {

        // include without exclude
        // we should be left with: val000, val001, val002, val003, val004, val005, val006, val007, val008, val009
        // with case insensitive flag on the include regex

        SearchResponse response = client().prepareSearch("idx").setTypes("high_card_type")
                .addAggregation(terms("terms")
                        .field("value").include("VAL00.+", Pattern.CASE_INSENSITIVE))
                .execute().actionGet();

        assertThat(response.getFailedShards(), equalTo(0));

        Terms terms = response.getAggregations().get("terms");
        assertThat(terms, notNullValue());
        assertThat(terms.getName(), equalTo("terms"));
        assertThat(terms.buckets().size(), equalTo(10));

        for (int i = 0; i < 10; i++) {
            Terms.Bucket bucket = terms.getByTerm("val00" + i);
            assertThat(bucket, notNullValue());
            assertThat(bucket.getKey().string(), equalTo("val00" + i));
            assertThat(bucket.getDocCount(), equalTo(1l));
        }

        // include and exclude
        // we should be left with: val002, val003, val004, val005, val006, val007, val008, val009
        // with multi-flag masking on the exclude regex

        response = client().prepareSearch("idx").setTypes("high_card_type")
                .addAggregation(terms("terms")
                        .field("value").include("val00.+").exclude("( val000 | VAL001 )#this is a comment", Pattern.CASE_INSENSITIVE | Pattern.COMMENTS))
                .execute().actionGet();

        assertThat(response.getFailedShards(), equalTo(0));

        terms = response.getAggregations().get("terms");
        assertThat(terms, notNullValue());
        assertThat(terms.getName(), equalTo("terms"));
        assertThat(terms.buckets().size(), equalTo(8));

        for (int i = 2; i < 10; i++) {
            Terms.Bucket bucket = terms.getByTerm("val00" + i);
            assertThat(bucket, notNullValue());
            assertThat(bucket.getKey().string(), equalTo("val00" + i));
            assertThat(bucket.getDocCount(), equalTo(1l));
        }

        // exclude without include
        // we should be left with: val000, val001, val002, val003, val004, val005, val006, val007, val008, val009
        // with a "no flag" flag

        response = client().prepareSearch("idx").setTypes("high_card_type")
                .addAggregation(terms("terms")
                        .field("value").exclude("val0[1-9]+.+", 0))
                .execute().actionGet();

        assertThat(response.getFailedShards(), equalTo(0));

        terms = response.getAggregations().get("terms");
        assertThat(terms, notNullValue());
        assertThat(terms.getName(), equalTo("terms"));
        assertThat(terms.buckets().size(), equalTo(10));

        for (int i = 0; i < 10; i++) {
            Terms.Bucket bucket = terms.getByTerm("val00" + i);
            assertThat(bucket, notNullValue());
            assertThat(bucket.getKey().string(), equalTo("val00" + i));
            assertThat(bucket.getDocCount(), equalTo(1l));
        }
    }


    @Test
    public void singleValueField_WithMaxSize() throws Exception {
        SearchResponse response = client().prepareSearch("idx").setTypes("high_card_type")