mirror of
https://github.com/honeymoose/OpenSearch.git
synced 2025-02-22 21:05:23 +00:00
Aggs enhancement - allow Include/Exclude clauses to use array of terms as alternative to a regex
Closes #6782
This commit is contained in:
parent
3e589cd25b
commit
3c8f8cc090
@ -449,67 +449,10 @@ WARNING: Use of background filters will slow the query as each term's postings m
|
||||
===== Filtering Values
|
||||
|
||||
It is possible (although rarely required) to filter the values for which buckets will be created. This can be done using the `include` and
|
||||
`exclude` parameters which are based on regular expressions. This functionality mirrors the features
|
||||
offered by the `terms` aggregation.
|
||||
`exclude` parameters which are based on a regular expression string or arrays of exact terms. This functionality mirrors the features
|
||||
described in the <<search-aggregations-bucket-terms-aggregation,terms aggregation>> documentation.
|
||||
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
"aggs" : {
|
||||
"tags" : {
|
||||
"significant_terms" : {
|
||||
"field" : "tags",
|
||||
"include" : ".*sport.*",
|
||||
"exclude" : "water_.*"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
--------------------------------------------------
|
||||
|
||||
In the above example, buckets will be created for all the tags that have the word `sport` in them, except those starting
|
||||
with `water_` (so the tag `water_sports` will not be aggregated). The `include` regular expression will determine what
|
||||
values are "allowed" to be aggregated, while the `exclude` determines the values that should not be aggregated. When
|
||||
both are defined, the `exclude` has precedence, meaning, the `include` is evaluated first and only then the `exclude`.
|
||||
|
||||
The regular expressions are based on the Java(TM) http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html[Pattern],
|
||||
and as such, it is also possible to pass in flags that will determine how the compiled regular expression will work:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
"aggs" : {
|
||||
"tags" : {
|
||||
"terms" : {
|
||||
"field" : "tags",
|
||||
"include" : {
|
||||
"pattern" : ".*sport.*",
|
||||
"flags" : "CANON_EQ|CASE_INSENSITIVE" <1>
|
||||
},
|
||||
"exclude" : {
|
||||
"pattern" : "water_.*",
|
||||
"flags" : "CANON_EQ|CASE_INSENSITIVE"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
--------------------------------------------------
|
||||
|
||||
<1> the flags are concatenated using the `|` character as a separator
|
||||
|
||||
The possible flags that can be used are:
|
||||
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#CANON_EQ[`CANON_EQ`],
|
||||
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#CASE_INSENSITIVE[`CASE_INSENSITIVE`],
|
||||
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#COMMENTS[`COMMENTS`],
|
||||
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#DOTALL[`DOTALL`],
|
||||
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#LITERAL[`LITERAL`],
|
||||
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#MULTILINE[`MULTILINE`],
|
||||
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNICODE_CASE[`UNICODE_CASE`],
|
||||
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNICODE_CHARACTER_CLASS[`UNICODE_CHARACTER_CLASS`] and
|
||||
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNIX_LINES[`UNIX_LINES`]
|
||||
|
||||
===== Execution hint
|
||||
|
||||
There are two mechanisms by which terms aggregations can be executed: either by using field values directly in order to aggregate
|
||||
|
@ -418,7 +418,7 @@ Generating the terms using a script:
|
||||
==== Filtering Values
|
||||
|
||||
It is possible to filter the values for which buckets will be created. This can be done using the `include` and
|
||||
`exclude` parameters which are based on regular expressions.
|
||||
`exclude` parameters which are based on regular expression strings or arrays of exact values.
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
@ -477,6 +477,29 @@ http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNICODE_CA
|
||||
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNICODE_CHARACTER_CLASS[`UNICODE_CHARACTER_CLASS`] and
|
||||
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNIX_LINES[`UNIX_LINES`]
|
||||
|
||||
For matching based on exact values the `include` and `exclude` parameters can simply take an array of
|
||||
strings that represent the terms as they are found in the index:
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
{
|
||||
"aggs" : {
|
||||
"JapaneseCars" : {
|
||||
"terms" : {
|
||||
"field" : "make",
|
||||
"include" : ["mazda", "honda"]
|
||||
}
|
||||
},
|
||||
"ActiveCarManufacturers" : {
|
||||
"terms" : {
|
||||
"field" : "make",
|
||||
"exclude" : ["rover", "jensen"]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
--------------------------------------------------
|
||||
|
||||
==== Multi-field terms aggregation
|
||||
|
||||
The `terms` aggregation does not support collecting terms from multiple fields
|
||||
|
@ -19,6 +19,7 @@
|
||||
|
||||
package org.elasticsearch.search.aggregations.bucket.terms;
|
||||
|
||||
import org.elasticsearch.ElasticsearchIllegalArgumentException;
|
||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
import org.elasticsearch.search.aggregations.Aggregator;
|
||||
import org.elasticsearch.search.aggregations.Aggregator.SubAggCollectionMode;
|
||||
@ -43,6 +44,8 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder<TermsBuilder> {
|
||||
private String executionHint;
|
||||
private SubAggCollectionMode collectionMode;
|
||||
private Boolean showTermDocCountError;
|
||||
private String[] includeTerms = null;
|
||||
private String[] excludeTerms = null;
|
||||
|
||||
/**
|
||||
* Sole constructor.
|
||||
@ -101,10 +104,24 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder<TermsBuilder> {
|
||||
* @see java.util.regex.Pattern#compile(String, int)
|
||||
*/
|
||||
public TermsBuilder include(String regex, int flags) {
|
||||
if (includeTerms != null) {
|
||||
throw new ElasticsearchIllegalArgumentException("exclude clause must be an array of strings or a regex, not both");
|
||||
}
|
||||
this.includePattern = regex;
|
||||
this.includeFlags = flags;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Define a set of terms that should be aggregated.
|
||||
*/
|
||||
public TermsBuilder include(String [] terms) {
|
||||
if (includePattern != null) {
|
||||
throw new ElasticsearchIllegalArgumentException("include clause must be an array of strings or a regex, not both");
|
||||
}
|
||||
this.includeTerms = terms;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Define a regular expression that will filter out terms that should be excluded from the aggregation. The regular
|
||||
@ -123,10 +140,25 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder<TermsBuilder> {
|
||||
* @see java.util.regex.Pattern#compile(String, int)
|
||||
*/
|
||||
public TermsBuilder exclude(String regex, int flags) {
|
||||
if (excludeTerms != null) {
|
||||
throw new ElasticsearchIllegalArgumentException("exclude clause must be an array of strings or a regex, not both");
|
||||
}
|
||||
this.excludePattern = regex;
|
||||
this.excludeFlags = flags;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Define a set of terms that should not be aggregated.
|
||||
*/
|
||||
public TermsBuilder exclude(String [] terms) {
|
||||
if (excludePattern != null) {
|
||||
throw new ElasticsearchIllegalArgumentException("exclude clause must be an array of strings or a regex, not both");
|
||||
}
|
||||
this.excludeTerms = terms;
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* When using scripts, the value type indicates the types of the values the script is generating.
|
||||
@ -189,6 +221,9 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder<TermsBuilder> {
|
||||
if (collectionMode != null) {
|
||||
builder.field(Aggregator.COLLECT_MODE.getPreferredName(), collectionMode.parseField().getPreferredName());
|
||||
}
|
||||
if (includeTerms != null) {
|
||||
builder.array("include", includeTerms);
|
||||
}
|
||||
if (includePattern != null) {
|
||||
if (includeFlags == 0) {
|
||||
builder.field("include", includePattern);
|
||||
@ -199,6 +234,9 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder<TermsBuilder> {
|
||||
.endObject();
|
||||
}
|
||||
}
|
||||
if (excludeTerms != null) {
|
||||
builder.array("exclude", excludeTerms);
|
||||
}
|
||||
if (excludePattern != null) {
|
||||
if (excludeFlags == 0) {
|
||||
builder.field("exclude", excludePattern);
|
||||
|
@ -21,8 +21,10 @@ package org.elasticsearch.search.aggregations.bucket.terms.support;
|
||||
import org.apache.lucene.index.RandomAccessOrds;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.CharsRefBuilder;
|
||||
import org.apache.lucene.util.LongBitSet;
|
||||
import org.elasticsearch.ElasticsearchParseException;
|
||||
import org.elasticsearch.ExceptionsHelper;
|
||||
import org.elasticsearch.common.regex.Regex;
|
||||
import org.elasticsearch.common.xcontent.XContentParser;
|
||||
@ -31,6 +33,8 @@ import org.elasticsearch.search.aggregations.support.ValuesSource;
|
||||
import org.elasticsearch.search.internal.SearchContext;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@ -43,51 +47,127 @@ public class IncludeExclude {
|
||||
private final Matcher include;
|
||||
private final Matcher exclude;
|
||||
private final CharsRefBuilder scratch = new CharsRefBuilder();
|
||||
private Set<BytesRef> includeValues;
|
||||
private Set<BytesRef> excludeValues;
|
||||
private final boolean hasRegexTest;
|
||||
|
||||
/**
|
||||
* @param include The regular expression pattern for the terms to be included
|
||||
* (may only be {@code null} if {@code exclude} is not {@code null}
|
||||
* (may only be {@code null} if one of the other arguments is non-null.)
|
||||
* @param includeValues The terms to be included
|
||||
* (may only be {@code null} if one of the other arguments is non-null.)
|
||||
* @param exclude The regular expression pattern for the terms to be excluded
|
||||
* (may only be {@code null} if {@code include} is not {@code null}
|
||||
* (may only be {@code null} if one of the other arguments is non-null.)
|
||||
* @param excludeValues The terms to be excluded
|
||||
* (may only be {@code null} if one of the other arguments is non-null.)
|
||||
*/
|
||||
public IncludeExclude(Pattern include, Pattern exclude) {
|
||||
assert include != null || exclude != null : "include & exclude cannot both be null"; // otherwise IncludeExclude object should be null
|
||||
public IncludeExclude(Pattern include, Pattern exclude, Set<BytesRef> includeValues, Set<BytesRef> excludeValues) {
|
||||
assert includeValues != null || include != null ||
|
||||
exclude != null || excludeValues != null : "includes & excludes cannot both be null"; // otherwise IncludeExclude object should be null
|
||||
this.include = include != null ? include.matcher("") : null;
|
||||
this.exclude = exclude != null ? exclude.matcher("") : null;
|
||||
hasRegexTest = include != null || exclude != null;
|
||||
this.includeValues = includeValues;
|
||||
this.excludeValues = excludeValues;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns whether the given value is accepted based on the {@code include} & {@code exclude} patterns.
|
||||
*/
|
||||
public boolean accept(BytesRef value) {
|
||||
scratch.copyUTF8Bytes(value);
|
||||
if (include == null) {
|
||||
// exclude must not be null
|
||||
return !exclude.reset(scratch.get()).matches();
|
||||
|
||||
if (hasRegexTest) {
|
||||
// We need to perform UTF8 to UTF16 conversion for use in the regex matching
|
||||
scratch.copyUTF8Bytes(value);
|
||||
}
|
||||
if (!include.reset(scratch.get()).matches()) {
|
||||
return false;
|
||||
}
|
||||
if (exclude == null) {
|
||||
return isIncluded(value, scratch.get()) && !isExcluded(value, scratch.get());
|
||||
}
|
||||
|
||||
private boolean isIncluded(BytesRef value, CharsRef utf16Chars) {
|
||||
|
||||
if ((includeValues == null) && (include == null)) {
|
||||
// No include criteria to be tested.
|
||||
return true;
|
||||
}
|
||||
return !exclude.reset(scratch.get()).matches();
|
||||
|
||||
if (include != null) {
|
||||
if (include.reset(scratch.get()).matches()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
if (includeValues != null) {
|
||||
if (includeValues.contains(value)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
// Some include criteria was tested but no match found
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean isExcluded(BytesRef value, CharsRef utf16Chars) {
|
||||
if (exclude != null) {
|
||||
if (exclude.reset(scratch.get()).matches()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
if (excludeValues != null) {
|
||||
if (excludeValues.contains(value)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
// No exclude criteria was tested or no match found
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes which global ordinals are accepted by this IncludeExclude instance.
|
||||
*/
|
||||
public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals, ValuesSource.Bytes.WithOrdinals valueSource) {
|
||||
TermsEnum globalTermsEnum = valueSource.globalOrdinalsValues().termsEnum();
|
||||
LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
|
||||
try {
|
||||
for (BytesRef term = globalTermsEnum.next(); term != null; term = globalTermsEnum.next()) {
|
||||
if (accept(term)) {
|
||||
acceptedGlobalOrdinals.set(globalTermsEnum.ord());
|
||||
// There are 3 ways of populating this bitset:
|
||||
// 1) Looking up the global ordinals for known "include" terms
|
||||
// 2) Looking up the global ordinals for known "exclude" terms
|
||||
// 3) Traversing the term enum for all terms and running past regexes
|
||||
// Option 3 is known to be very slow in the case of high-cardinality fields and
|
||||
// should be avoided if possible.
|
||||
if (includeValues != null) {
|
||||
// optimize for the case where the set of accepted values is a set
|
||||
// of known terms, not a regex that would have to be tested against all terms in the index
|
||||
for (BytesRef includeValue : includeValues) {
|
||||
// We need to perform UTF8 to UTF16 conversion for use in the regex matching
|
||||
scratch.copyUTF8Bytes(includeValue);
|
||||
if (!isExcluded(includeValue, scratch.get())) {
|
||||
long ord = globalOrdinals.lookupTerm(includeValue);
|
||||
if (ord >= 0) {
|
||||
acceptedGlobalOrdinals.set(ord);
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw ExceptionsHelper.convertToElastic(e);
|
||||
} else {
|
||||
if(hasRegexTest) {
|
||||
// We have includeVals that are a regex or only regex excludes - we need to do the potentially
|
||||
// slow option of hitting termsEnum for every term in the index.
|
||||
TermsEnum globalTermsEnum = valueSource.globalOrdinalsValues().termsEnum();
|
||||
try {
|
||||
for (BytesRef term = globalTermsEnum.next(); term != null; term = globalTermsEnum.next()) {
|
||||
if (accept(term)) {
|
||||
acceptedGlobalOrdinals.set(globalTermsEnum.ord());
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw ExceptionsHelper.convertToElastic(e);
|
||||
}
|
||||
} else {
|
||||
// we only have a set of known values to exclude - create a bitset with all good values and negate the known bads
|
||||
acceptedGlobalOrdinals.set(0, acceptedGlobalOrdinals.length());
|
||||
for (BytesRef excludeValue : excludeValues) {
|
||||
long ord = globalOrdinals.lookupTerm(excludeValue);
|
||||
if (ord >= 0) {
|
||||
acceptedGlobalOrdinals.clear(ord);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
return acceptedGlobalOrdinals;
|
||||
}
|
||||
@ -102,6 +182,8 @@ public class IncludeExclude {
|
||||
int includeFlags = 0; // 0 means no flags
|
||||
String exclude = null;
|
||||
int excludeFlags = 0; // 0 means no flags
|
||||
Set<BytesRef> includeValues;
|
||||
Set<BytesRef> excludeValues;
|
||||
|
||||
public Parser(String aggName, InternalAggregation.Type aggType, SearchContext context) {
|
||||
this.aggName = aggName;
|
||||
@ -121,6 +203,18 @@ public class IncludeExclude {
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
if (token == XContentParser.Token.START_ARRAY) {
|
||||
if ("include".equals(currentFieldName)) {
|
||||
includeValues = parseArrayToSet(parser);
|
||||
return true;
|
||||
}
|
||||
if ("exclude".equals(currentFieldName)) {
|
||||
excludeValues = parseArrayToSet(parser);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
if (token == XContentParser.Token.START_OBJECT) {
|
||||
if ("include".equals(currentFieldName)) {
|
||||
@ -163,14 +257,27 @@ public class IncludeExclude {
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private Set<BytesRef> parseArrayToSet(XContentParser parser) throws IOException {
|
||||
final Set<BytesRef> set = new HashSet<>();
|
||||
if (parser.currentToken() != XContentParser.Token.START_ARRAY) {
|
||||
throw new ElasticsearchParseException("Missing start of array in include/exclude clause");
|
||||
}
|
||||
while (parser.nextToken() != XContentParser.Token.END_ARRAY) {
|
||||
if (!parser.currentToken().isValue()) {
|
||||
throw new ElasticsearchParseException("Array elements in include/exclude clauses should be string values");
|
||||
}
|
||||
set.add(new BytesRef(parser.text()));
|
||||
}
|
||||
return set;
|
||||
}
|
||||
|
||||
public IncludeExclude includeExclude() {
|
||||
if (include == null && exclude == null) {
|
||||
if (include == null && exclude == null && includeValues == null && excludeValues == null) {
|
||||
return null;
|
||||
}
|
||||
Pattern includePattern = include != null ? Pattern.compile(include, includeFlags) : null;
|
||||
Pattern excludePattern = exclude != null ? Pattern.compile(exclude, excludeFlags) : null;
|
||||
return new IncludeExclude(includePattern, excludePattern);
|
||||
return new IncludeExclude(includePattern, excludePattern, includeValues, excludeValues);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -38,9 +38,11 @@ import org.elasticsearch.test.ElasticsearchIntegrationTest;
|
||||
import org.hamcrest.Matchers;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.text.NumberFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
|
||||
@ -51,6 +53,7 @@ import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSear
|
||||
import static org.hamcrest.Matchers.equalTo;
|
||||
import static org.hamcrest.Matchers.is;
|
||||
import static org.hamcrest.core.IsNull.notNullValue;
|
||||
import static org.hamcrest.core.IsNull.nullValue;
|
||||
|
||||
/**
|
||||
*
|
||||
@ -336,6 +339,94 @@ public class StringTermsTests extends ElasticsearchIntegrationTest {
|
||||
assertThat(bucket.getDocCount(), equalTo(1l));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
 * Integration test for the terms aggregation's exact-term (array-based)
 * include/exclude filtering, covering three cases: include only, include
 * combined with exclude, and exclude only.
 */
@Test
public void singleValueField_WithExactTermFiltering() throws Exception {
    // include without exclude:
    // only the ten listed terms may form buckets; each is asserted to hold
    // exactly one document.
    String incVals[] = { "val000", "val001", "val002", "val003", "val004", "val005", "val006", "val007", "val008", "val009" };
    SearchResponse response = client().prepareSearch("idx").setTypes("high_card_type")
            .addAggregation(terms("terms")
                    .executionHint(randomExecutionHint())
                    .field(SINGLE_VALUED_FIELD_NAME)
                    .collectMode(randomFrom(SubAggCollectionMode.values()))
                    .include(incVals))
            .execute().actionGet();

    assertSearchResponse(response);

    Terms terms = response.getAggregations().get("terms");
    assertThat(terms, notNullValue());
    assertThat(terms.getName(), equalTo("terms"));
    assertThat(terms.getBuckets().size(), equalTo(incVals.length));

    for (String incVal : incVals) {
        Terms.Bucket bucket = terms.getBucketByKey(incVal);
        assertThat(bucket, notNullValue());
        assertThat(key(bucket), equalTo(incVal));
        assertThat(bucket.getDocCount(), equalTo(1l));
    }

    // include and exclude:
    // Slightly illogical example with exact terms below as include and exclude sets
    // are made to overlap but the exclude set should have priority over matches.
    // we should be left with: val002, val003, val004, val005, val006, val007, val008, val009
    String excVals[] = { "val000", "val001" };

    response = client().prepareSearch("idx").setTypes("high_card_type")
            .addAggregation(terms("terms")
                    .executionHint(randomExecutionHint())
                    .field(SINGLE_VALUED_FIELD_NAME)
                    .collectMode(randomFrom(SubAggCollectionMode.values()))
                    .include(incVals)
                    .exclude(excVals))
            .execute().actionGet();

    assertSearchResponse(response);

    terms = response.getAggregations().get("terms");
    assertThat(terms, notNullValue());
    assertThat(terms.getName(), equalTo("terms"));
    // 10 included terms minus the 2 excluded ones
    assertThat(terms.getBuckets().size(), equalTo(8));

    for (int i = 2; i < 10; i++) {
        Terms.Bucket bucket = terms.getBucketByKey("val00" + i);
        assertThat(bucket, notNullValue());
        assertThat(key(bucket), equalTo("val00" + i));
        assertThat(bucket.getDocCount(), equalTo(1l));
    }

    // Check case with only exact term exclude clauses:
    // the two excluded terms must be absent, and the returned buckets
    // (presumably the default terms size of 10 — see index setup elsewhere
    // in this class) start at val002 and run through val011.
    response = client().prepareSearch("idx").setTypes("high_card_type")
            .addAggregation(terms("terms")
                    .executionHint(randomExecutionHint())
                    .field(SINGLE_VALUED_FIELD_NAME)
                    .collectMode(randomFrom(SubAggCollectionMode.values()))
                    .exclude(excVals))
            .execute().actionGet();

    assertSearchResponse(response);

    terms = response.getAggregations().get("terms");
    assertThat(terms, notNullValue());
    assertThat(terms.getName(), equalTo("terms"));
    assertThat(terms.getBuckets().size(), equalTo(10));
    for (String key : excVals) {
        Terms.Bucket bucket = terms.getBucketByKey(key);
        assertThat(bucket, nullValue());
    }
    // Pad to three digits so generated keys line up with the indexed "valNNN" terms.
    NumberFormat nf=NumberFormat.getIntegerInstance(Locale.ENGLISH);
    nf.setMinimumIntegerDigits(3);
    for (int i = 2; i < 12; i++) {
        Terms.Bucket bucket = terms.getBucketByKey("val" + nf.format(i));
        assertThat(bucket, notNullValue());
        assertThat(key(bucket), equalTo("val" + nf.format(i)));
        assertThat(bucket.getDocCount(), equalTo(1l));
    }

}
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
|
Loading…
x
Reference in New Issue
Block a user