diff --git a/docs/reference/migration/migrate_2_0.asciidoc b/docs/reference/migration/migrate_2_0.asciidoc index dee483e6e77..f875363bb8b 100644 --- a/docs/reference/migration/migrate_2_0.asciidoc +++ b/docs/reference/migration/migrate_2_0.asciidoc @@ -139,6 +139,9 @@ equivalent to the former `pre_zone` option. Setting `time_zone` to a value like being applied in the specified time zone but In addition to this, also the `pre_zone_adjust_large_interval` is removed because we now always return dates and bucket keys in UTC. +`include`/`exclude` filtering on the `terms` aggregation now uses the same syntax as regexp queries instead of the Java syntax. While simple +regexps should still work, more complex ones might need some rewriting. Also, the `flags` parameter is not supported anymore. + === Terms filter lookup caching The terms filter lookup mechanism does not support the `cache` option anymore diff --git a/docs/reference/search/aggregations/bucket/terms-aggregation.asciidoc b/docs/reference/search/aggregations/bucket/terms-aggregation.asciidoc index 38f6630628a..6b93e926cdd 100644 --- a/docs/reference/search/aggregations/bucket/terms-aggregation.asciidoc +++ b/docs/reference/search/aggregations/bucket/terms-aggregation.asciidoc @@ -482,42 +482,7 @@ with `water_` (so the tag `water_sports` will no be aggregated). The `include` r values are "allowed" to be aggregated, while the `exclude` determines the values that should not be aggregated. When both are defined, the `exclude` has precedence, meaning, the `include` is evaluated first and only then the `exclude`. -The regular expression are based on the Java(TM) http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html[Pattern], -and as such, they it is also possible to pass in flags that will determine how the compiled regular expression will work: - -[source,js] --------------------------------------------------- -{ - "aggs" : { - "tags" : { - "terms" : { - "field" : "tags", - "include" : { - "pattern" : ".*sport.*", - "flags" : "CANON_EQ|CASE_INSENSITIVE" <1> - }, - "exclude" : { - "pattern" : "water_.*", - "flags" : "CANON_EQ|CASE_INSENSITIVE" - } - } - } - } -} --------------------------------------------------- - -<1> the flags are concatenated using the `|` character as a separator - -The possible flags that can be used are: -http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#CANON_EQ[`CANON_EQ`], -http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#CASE_INSENSITIVE[`CASE_INSENSITIVE`], -http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#COMMENTS[`COMMENTS`], -http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#DOTALL[`DOTALL`], -http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#LITERAL[`LITERAL`], -http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#MULTILINE[`MULTILINE`], -http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNICODE_CASE[`UNICODE_CASE`], -http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNICODE_CHARACTER_CLASS[`UNICODE_CHARACTER_CLASS`] and -http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNIX_LINES[`UNIX_LINES`] +The syntax is the same as <>. For matching based on exact values the `include` and `exclude` parameters can simply take an array of strings that represent the terms as they are found in the index: diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/GlobalOrdinalsSignificantTermsAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/GlobalOrdinalsSignificantTermsAggregator.java index fc8e5e4b7f7..49a7e56eefb 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/GlobalOrdinalsSignificantTermsAggregator.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/GlobalOrdinalsSignificantTermsAggregator.java @@ -48,7 +48,7 @@ public class GlobalOrdinalsSignificantTermsAggregator extends GlobalOrdinalsStri public GlobalOrdinalsSignificantTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource, BucketCountThresholds bucketCountThresholds, - IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent, + IncludeExclude.OrdinalsFilter includeExclude, AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggFactory, Map metaData) throws IOException { super(name, factories, valuesSource, null, bucketCountThresholds, includeExclude, aggregationContext, parent, SubAggCollectionMode.DEPTH_FIRST, false, metaData); @@ -145,7 +145,7 @@ public class GlobalOrdinalsSignificantTermsAggregator extends GlobalOrdinalsStri private final LongHash bucketOrds; - public WithHash(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource, BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggFactory, Map metaData) throws IOException { + public WithHash(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource, BucketCountThresholds bucketCountThresholds, IncludeExclude.OrdinalsFilter includeExclude, AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggFactory, Map metaData) throws IOException { super(name, factories, valuesSource, bucketCountThresholds, includeExclude, aggregationContext, parent, termsAggFactory, metaData); bucketOrds = new LongHash(1, aggregationContext.bigArrays()); } diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java index fb65fd7d6f8..532a71efae7 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java @@ -47,7 +47,7 @@ public class SignificantStringTermsAggregator extends StringTermsAggregator { public SignificantStringTermsAggregator(String name, AggregatorFactories factories, ValuesSource valuesSource, BucketCountThresholds bucketCountThresholds, - IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent, + IncludeExclude.StringFilter includeExclude, AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggFactory, Map metaData) throws IOException { super(name, factories, valuesSource, null, bucketCountThresholds, includeExclude, aggregationContext, parent, SubAggCollectionMode.DEPTH_FIRST, false, metaData); diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java index e6058471eb9..ef837cfab82 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java @@ -65,7 +65,8 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource, TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggregatorFactory, Map metaData) throws IOException { - return new SignificantStringTermsAggregator(name, factories, valuesSource, bucketCountThresholds, includeExclude, aggregationContext, parent, termsAggregatorFactory, metaData); + final IncludeExclude.StringFilter filter = includeExclude == null ? null : includeExclude.convertToStringFilter(); + return new SignificantStringTermsAggregator(name, factories, valuesSource, bucketCountThresholds, filter, aggregationContext, parent, termsAggregatorFactory, metaData); } }, @@ -77,7 +78,8 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggregatorFactory, Map metaData) throws IOException { ValuesSource.Bytes.WithOrdinals valueSourceWithOrdinals = (ValuesSource.Bytes.WithOrdinals) valuesSource; IndexSearcher indexSearcher = aggregationContext.searchContext().searcher(); - return new GlobalOrdinalsSignificantTermsAggregator(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, bucketCountThresholds, includeExclude, aggregationContext, parent, termsAggregatorFactory, metaData); + final IncludeExclude.OrdinalsFilter filter = includeExclude == null ? null : includeExclude.convertToOrdinalsFilter(); + return new GlobalOrdinalsSignificantTermsAggregator(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, bucketCountThresholds, filter, aggregationContext, parent, termsAggregatorFactory, metaData); } }, @@ -87,7 +89,8 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource, TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggregatorFactory, Map metaData) throws IOException { - return new GlobalOrdinalsSignificantTermsAggregator.WithHash(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, bucketCountThresholds, includeExclude, aggregationContext, parent, termsAggregatorFactory, metaData); + final IncludeExclude.OrdinalsFilter filter = includeExclude == null ? null : includeExclude.convertToOrdinalsFilter(); + return new GlobalOrdinalsSignificantTermsAggregator.WithHash(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, bucketCountThresholds, filter, aggregationContext, parent, termsAggregatorFactory, metaData); } }; diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsParser.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsParser.java index 7492b698018..28e0fb5a812 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsParser.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsParser.java @@ -57,7 +57,7 @@ public class SignificantTermsParser implements Aggregator.Parser { .scriptable(false) .formattable(true) .build(); - IncludeExclude.Parser incExcParser = new IncludeExclude.Parser(aggregationName, SignificantStringTerms.TYPE, context); + IncludeExclude.Parser incExcParser = new IncludeExclude.Parser(); aggParser.parse(aggregationName, parser, context, vsParser, incExcParser); TermsAggregator.BucketCountThresholds bucketCountThresholds = aggParser.getBucketCountThresholds(); diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index 70904ac2ca3..767f2d50926 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -57,7 +57,7 @@ import java.util.Map; public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggregator { protected final ValuesSource.Bytes.WithOrdinals.FieldData valuesSource; - protected final IncludeExclude includeExclude; + protected final IncludeExclude.OrdinalsFilter includeExclude; // TODO: cache the acceptedglobalValues per aggregation definition. // We can't cache this yet in ValuesSource, since ValuesSource is reused per field for aggs during the execution. @@ -71,7 +71,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr public GlobalOrdinalsStringTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource, Terms.Order order, BucketCountThresholds bucketCountThresholds, - IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map metaData) throws IOException { + IncludeExclude.OrdinalsFilter includeExclude, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map metaData) throws IOException { super(name, factories, aggregationContext, parent, order, bucketCountThresholds, collectionMode, showTermDocCountError, metaData); this.valuesSource = valuesSource; this.includeExclude = includeExclude; @@ -260,7 +260,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr private final LongHash bucketOrds; public WithHash(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource, - Terms.Order order, BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude, AggregationContext aggregationContext, + Terms.Order order, BucketCountThresholds bucketCountThresholds, IncludeExclude.OrdinalsFilter includeExclude, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map metaData) throws IOException { super(name, factories, valuesSource, order, bucketCountThresholds, includeExclude, aggregationContext, parent, collectionMode, showTermDocCountError, metaData); bucketOrds = new LongHash(1, aggregationContext.bigArrays()); diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregator.java index 9d731a25529..d625e3b9954 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregator.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregator.java @@ -45,11 +45,11 @@ public class StringTermsAggregator extends AbstractStringTermsAggregator { private final ValuesSource valuesSource; protected final BytesRefHash bucketOrds; - private final IncludeExclude includeExclude; + private final IncludeExclude.StringFilter includeExclude; public StringTermsAggregator(String name, AggregatorFactories factories, ValuesSource valuesSource, Terms.Order order, BucketCountThresholds bucketCountThresholds, - IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map metaData) throws IOException { + IncludeExclude.StringFilter includeExclude, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map metaData) throws IOException { super(name, factories, aggregationContext, parent, order, bucketCountThresholds, collectionMode, showTermDocCountError, metaData); this.valuesSource = valuesSource; diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java index 6fbbd306411..3fa99d2b7fd 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java @@ -50,7 +50,8 @@ public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory metaData) throws IOException { - return new StringTermsAggregator(name, factories, valuesSource, order, bucketCountThresholds, includeExclude, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData); + final IncludeExclude.StringFilter filter = includeExclude == null ? null : includeExclude.convertToStringFilter(); + return new StringTermsAggregator(name, factories, valuesSource, order, bucketCountThresholds, filter, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData); } @Override @@ -65,7 +66,8 @@ public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory metaData) throws IOException { - return new GlobalOrdinalsStringTermsAggregator(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, order, bucketCountThresholds, includeExclude, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData); + final IncludeExclude.OrdinalsFilter filter = includeExclude == null ? null : includeExclude.convertToOrdinalsFilter(); + return new GlobalOrdinalsStringTermsAggregator(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, order, bucketCountThresholds, filter, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData); } @Override @@ -80,7 +82,8 @@ public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory metaData) throws IOException { - return new GlobalOrdinalsStringTermsAggregator.WithHash(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, order, bucketCountThresholds, includeExclude, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData); + final IncludeExclude.OrdinalsFilter filter = includeExclude == null ? null : includeExclude.convertToOrdinalsFilter(); + return new GlobalOrdinalsStringTermsAggregator.WithHash(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, order, bucketCountThresholds, filter, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData); } @Override diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsBuilder.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsBuilder.java index 2243654cf3a..ea2e40587d4 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsBuilder.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsBuilder.java @@ -19,6 +19,7 @@ package org.elasticsearch.search.aggregations.bucket.terms; +import org.apache.lucene.util.automaton.RegExp; import org.elasticsearch.ElasticsearchIllegalArgumentException; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.search.aggregations.Aggregator.SubAggCollectionMode; @@ -37,9 +38,7 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder { private Terms.ValueType valueType; private Terms.Order order; private String includePattern; - private int includeFlags; private String excludePattern; - private int excludeFlags; private String executionHint; private SubAggCollectionMode collectionMode; private Boolean showTermDocCountError; @@ -88,26 +87,15 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder { /** * Define a regular expression that will determine what terms should be aggregated. The regular expression is based - * on the {@link java.util.regex.Pattern} class. + * on the {@link RegExp} class. * - * @see #include(String, int) + * @see {@link RegExp#RegExp(String)} */ public TermsBuilder include(String regex) { - return include(regex, 0); - } - - /** - * Define a regular expression that will determine what terms should be aggregated. The regular expression is based - * on the {@link java.util.regex.Pattern} class. - * - * @see java.util.regex.Pattern#compile(String, int) - */ - public TermsBuilder include(String regex, int flags) { if (includeTerms != null) { throw new ElasticsearchIllegalArgumentException("exclude clause must be an array of strings or a regex, not both"); } this.includePattern = regex; - this.includeFlags = flags; return this; } @@ -160,29 +148,18 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder { } return termsAsString; } - - /** - * Define a regular expression that will filter out terms that should be excluded from the aggregation. The regular - * expression is based on the {@link java.util.regex.Pattern} class. - * - * @see #exclude(String, int) - */ - public TermsBuilder exclude(String regex) { - return exclude(regex, 0); - } /** * Define a regular expression that will filter out terms that should be excluded from the aggregation. The regular - * expression is based on the {@link java.util.regex.Pattern} class. + * expression is based on the {@link RegExp} class. * - * @see java.util.regex.Pattern#compile(String, int) + * @see {@link RegExp#RegExp(String)} */ - public TermsBuilder exclude(String regex, int flags) { + public TermsBuilder exclude(String regex) { if (excludeTerms != null) { throw new ElasticsearchIllegalArgumentException("exclude clause must be an array of exact values or a regex, not both"); } this.excludePattern = regex; - this.excludeFlags = flags; return this; } @@ -287,27 +264,13 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder { builder.array("include", includeTerms); } if (includePattern != null) { - if (includeFlags == 0) { - builder.field("include", includePattern); - } else { - builder.startObject("include") - .field("pattern", includePattern) - .field("flags", includeFlags) - .endObject(); - } + builder.field("include", includePattern); } if (excludeTerms != null) { builder.array("exclude", excludeTerms); } if (excludePattern != null) { - if (excludeFlags == 0) { - builder.field("exclude", excludePattern); - } else { - builder.startObject("exclude") - .field("pattern", excludePattern) - .field("flags", excludeFlags) - .endObject(); - } + builder.field("exclude", excludePattern); } return builder; } diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsParser.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsParser.java index 01a5cc7bc2f..478309d1bc0 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsParser.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsParser.java @@ -46,7 +46,7 @@ public class TermsParser implements Aggregator.Parser { public AggregatorFactory parse(String aggregationName, XContentParser parser, SearchContext context) throws IOException { TermsParametersParser aggParser = new TermsParametersParser(); ValuesSourceParser vsParser = ValuesSourceParser.any(aggregationName, StringTerms.TYPE, context).scriptable(true).formattable(true).build(); - IncludeExclude.Parser incExcParser = new IncludeExclude.Parser(aggregationName, StringTerms.TYPE, context); + IncludeExclude.Parser incExcParser = new IncludeExclude.Parser(); aggParser.parse(aggregationName, parser, context, vsParser, incExcParser); List orderElements = aggParser.getOrderElements(); diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/support/IncludeExclude.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/support/IncludeExclude.java index 6eff3fcedfc..e653ad853a9 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/support/IncludeExclude.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/support/IncludeExclude.java @@ -20,22 +20,30 @@ package org.elasticsearch.search.aggregations.bucket.terms.support; import com.carrotsearch.hppc.LongOpenHashSet; import com.carrotsearch.hppc.LongSet; + import org.apache.lucene.index.RandomAccessOrds; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.util.*; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LongBitSet; +import org.apache.lucene.util.NumericUtils; +import org.apache.lucene.util.automaton.Automata; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.ByteRunAutomaton; +import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.util.automaton.RegExp; +import org.elasticsearch.ElasticsearchIllegalArgumentException; import org.elasticsearch.ElasticsearchParseException; -import org.elasticsearch.ExceptionsHelper; -import org.elasticsearch.common.regex.Regex; import org.elasticsearch.common.xcontent.XContentParser; -import org.elasticsearch.search.aggregations.InternalAggregation; import org.elasticsearch.search.aggregations.support.ValuesSource; -import org.elasticsearch.search.internal.SearchContext; import java.io.IOException; import java.util.HashSet; import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; +import java.util.SortedSet; +import java.util.TreeSet; /** * Defines the include/exclude regular expression filtering for string terms aggregation. In this filtering logic, @@ -43,8 +51,8 @@ import java.util.regex.Pattern; */ public class IncludeExclude { - // The includeValue and excludeValue ByteRefs which are the result of the parsing - // process are converted into a LongFilter when used on numeric fields + // The includeValue and excludeValue ByteRefs which are the result of the parsing + // process are converted into a LongFilter when used on numeric fields // in the index. public static class LongFilter { private LongSet valids; @@ -72,152 +80,145 @@ public class IncludeExclude { } } - private final Matcher include; - private final Matcher exclude; - private final CharsRefBuilder scratch = new CharsRefBuilder(); - private Set includeValues; - private Set excludeValues; - private final boolean hasRegexTest; + // Only used for the 'map' execution mode (ie. scripts) + public static class StringFilter { + + private final ByteRunAutomaton runAutomaton; + + private StringFilter(Automaton automaton) { + this.runAutomaton = new ByteRunAutomaton(automaton); + } + + /** + * Returns whether the given value is accepted based on the {@code include} & {@code exclude} patterns. + */ + public boolean accept(BytesRef value) { + return runAutomaton.run(value.bytes, value.offset, value.length); + } + } + + public static class OrdinalsFilter { + + private final CompiledAutomaton compiled; + + private OrdinalsFilter(Automaton automaton) { + this.compiled = new CompiledAutomaton(automaton); + } + + /** + * Computes which global ordinals are accepted by this IncludeExclude instance. + */ + public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals, ValuesSource.Bytes.WithOrdinals valueSource) throws IOException { + LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount()); + TermsEnum globalTermsEnum; + Terms globalTerms = new DocValuesTerms(globalOrdinals); + // TODO: specialize based on compiled.type: for ALL and prefixes (sinkState >= 0 ) we can avoid i/o and just set bits. + globalTermsEnum = compiled.getTermsEnum(globalTerms); + for (BytesRef term = globalTermsEnum.next(); term != null; term = globalTermsEnum.next()) { + acceptedGlobalOrdinals.set(globalTermsEnum.ord()); + } + return acceptedGlobalOrdinals; + } + + } + + private final RegExp include, exclude; + private final SortedSet includeValues, excludeValues; /** * @param include The regular expression pattern for the terms to be included - * (may only be {@code null} if one of the other arguments is none-null. - * @param includeValues The terms to be included - * (may only be {@code null} if one of the other arguments is none-null. * @param exclude The regular expression pattern for the terms to be excluded - * (may only be {@code null} if one of the other arguments is none-null. - * @param excludeValues The terms to be excluded - * (may only be {@code null} if one of the other arguments is none-null. */ - public IncludeExclude(Pattern include, Pattern exclude, Set includeValues, Set excludeValues) { - assert includeValues != null || include != null || - exclude != null || excludeValues != null : "includes & excludes cannot both be null"; // otherwise IncludeExclude object should be null - this.include = include != null ? include.matcher("") : null; - this.exclude = exclude != null ? exclude.matcher("") : null; - hasRegexTest = include != null || exclude != null; + public IncludeExclude(RegExp include, RegExp exclude) { + if (include == null && exclude == null) { + throw new IllegalArgumentException(); + } + this.include = include; + this.exclude = exclude; + this.includeValues = null; + this.excludeValues = null; + } + + /** + * @param includeValues The terms to be included + * @param excludeValues The terms to be excluded + */ + public IncludeExclude(SortedSet includeValues, SortedSet excludeValues) { + if (includeValues == null && excludeValues == null) { + throw new IllegalArgumentException(); + } + this.include = null; + this.exclude = null; this.includeValues = includeValues; this.excludeValues = excludeValues; } /** - * Returns whether the given value is accepted based on the {@code include} & {@code exclude} patterns. + * Terms adapter around doc values. */ - public boolean accept(BytesRef value) { + private static class DocValuesTerms extends Terms { - if (hasRegexTest) { - // We need to perform UTF8 to UTF16 conversion for use in the regex matching - scratch.copyUTF8Bytes(value); - } - return isIncluded(value, scratch.get()) && !isExcluded(value, scratch.get()); - } - - private boolean isIncluded(BytesRef value, CharsRef utf16Chars) { + private final SortedSetDocValues values; - if ((includeValues == null) && (include == null)) { - // No include criteria to be tested. - return true; + DocValuesTerms(SortedSetDocValues values) { + this.values = values; } - - if (include != null) { - if (include.reset(scratch.get()).matches()) { - return true; - } + + @Override + public TermsEnum iterator(TermsEnum reuse) throws IOException { + return values.termsEnum(); } - if (includeValues != null) { - if (includeValues.contains(value)) { - return true; - } + + @Override + public long size() throws IOException { + return -1; } - // Some include criteria was tested but no match found - return false; - } - - private boolean isExcluded(BytesRef value, CharsRef utf16Chars) { - if (exclude != null) { - if (exclude.reset(scratch.get()).matches()) { - return true; - } + + @Override + public long getSumTotalTermFreq() throws IOException { + return -1; } - if (excludeValues != null) { - if (excludeValues.contains(value)) { - return true; - } + + @Override + public long getSumDocFreq() throws IOException { + return -1; } - // No exclude criteria was tested or no match found - return false; + + @Override + public int getDocCount() throws IOException { + return -1; + } + + @Override + public boolean hasFreqs() { + return false; + } + + @Override + public boolean hasOffsets() { + return false; + } + + @Override + public boolean hasPositions() { + return false; + } + + @Override + public boolean hasPayloads() { + return false; + } + } - /** - * Computes which global ordinals are accepted by this IncludeExclude instance. - */ - public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals, ValuesSource.Bytes.WithOrdinals valueSource) { - LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount()); - // There are 3 ways of populating this bitset: - // 1) Looking up the global ordinals for known "include" terms - // 2) Looking up the global ordinals for known "exclude" terms - // 3) Traversing the term enum for all terms and running past regexes - // Option 3 is known to be very slow in the case of high-cardinality fields and - // should be avoided if possible. - if (includeValues != null) { - // optimize for the case where the set of accepted values is a set - // of known terms, not a regex that would have to be tested against all terms in the index - for (BytesRef includeValue : includeValues) { - // We need to perform UTF8 to UTF16 conversion for use in the regex matching - scratch.copyUTF8Bytes(includeValue); - if (!isExcluded(includeValue, scratch.get())) { - long ord = globalOrdinals.lookupTerm(includeValue); - if (ord >= 0) { - acceptedGlobalOrdinals.set(ord); - } - } - } - } else { - if(hasRegexTest) { - // We have includeVals that are a regex or only regex excludes - we need to do the potentially - // slow option of hitting termsEnum for every term in the index. - TermsEnum globalTermsEnum = globalOrdinals.termsEnum(); - try { - for (BytesRef term = globalTermsEnum.next(); term != null; term = globalTermsEnum.next()) { - if (accept(term)) { - acceptedGlobalOrdinals.set(globalTermsEnum.ord()); - } - } - } catch (IOException e) { - throw ExceptionsHelper.convertToElastic(e); - } - } else { - // we only have a set of known values to exclude - create a bitset with all good values and negate the known bads - acceptedGlobalOrdinals.set(0, acceptedGlobalOrdinals.length()); - for (BytesRef excludeValue : excludeValues) { - long ord = globalOrdinals.lookupTerm(excludeValue); - if (ord >= 0) { - acceptedGlobalOrdinals.clear(ord); - } - } - - } - } - return acceptedGlobalOrdinals; - } + public static class Parser { - private final String aggName; - private final InternalAggregation.Type aggType; - private final SearchContext context; - String include = null; - int includeFlags = 0; // 0 means no flags String exclude = null; - int excludeFlags = 0; // 0 means no flags - Set includeValues; - Set excludeValues; - - public Parser(String aggName, InternalAggregation.Type aggType, SearchContext context) { - this.aggName = aggName; - this.aggType = aggType; - this.context = context; - } + SortedSet includeValues; + SortedSet excludeValues; public boolean token(String currentFieldName, XContentParser.Token token, XContentParser parser) throws IOException { @@ -231,14 +232,14 @@ public class IncludeExclude { } return true; } - + if (token == XContentParser.Token.START_ARRAY) { if ("include".equals(currentFieldName)) { - includeValues = parseArrayToSet(parser); + includeValues = new TreeSet<>(parseArrayToSet(parser)); return true; - } + } if ("exclude".equals(currentFieldName)) { - excludeValues = parseArrayToSet(parser); + excludeValues = new TreeSet<>(parseArrayToSet(parser)); return true; } return false; @@ -252,12 +253,6 @@ public class IncludeExclude { } else if (token == XContentParser.Token.VALUE_STRING) { if ("pattern".equals(currentFieldName)) { include = parser.text(); - } else if ("flags".equals(currentFieldName)) { - includeFlags = Regex.flagsFromString(parser.text()); - } - } else if (token == XContentParser.Token.VALUE_NUMBER) { - if ("flags".equals(currentFieldName)) { - includeFlags = parser.intValue(); } } } @@ -268,12 +263,6 @@ public class IncludeExclude { } else if (token == XContentParser.Token.VALUE_STRING) { if ("pattern".equals(currentFieldName)) { exclude = parser.text(); - } else if ("flags".equals(currentFieldName)) { - excludeFlags = Regex.flagsFromString(parser.text()); - } - } else if (token == XContentParser.Token.VALUE_NUMBER) { - if ("flags".equals(currentFieldName)) { - excludeFlags = parser.intValue(); } } } @@ -298,19 +287,50 @@ public class IncludeExclude { } return set; } - + public IncludeExclude includeExclude() { - if (include == null && exclude == null && includeValues == null && excludeValues == null) { + RegExp includePattern = include != null ? new RegExp(include) : null; + RegExp excludePattern = exclude != null ? new RegExp(exclude) : null; + if (includePattern != null || excludePattern != null) { + if (includeValues != null || excludeValues != null) { + throw new ElasticsearchIllegalArgumentException("Can only use regular expression include/exclude or a set of values, not both"); + } + return new IncludeExclude(includePattern, excludePattern); + } else if (includeValues != null || excludeValues != null) { + return new IncludeExclude(includeValues, excludeValues); + } else { return null; } - Pattern includePattern = include != null ? Pattern.compile(include, includeFlags) : null; - Pattern excludePattern = exclude != null ? Pattern.compile(exclude, excludeFlags) : null; - return new IncludeExclude(includePattern, excludePattern, includeValues, excludeValues); } } public boolean isRegexBased() { - return hasRegexTest; + return include != null || exclude != null; + } + + private Automaton toAutomaton() { + Automaton a = null; + if (include != null) { + a = include.toAutomaton(); + } else if (includeValues != null) { + a = Automata.makeStringUnion(includeValues); + } else { + a = Automata.makeAnyString(); + } + if (exclude != null) { + a = Operations.minus(a, exclude.toAutomaton(), Operations.DEFAULT_MAX_DETERMINIZED_STATES); + } else if (excludeValues != null) { + a = Operations.minus(a, Automata.makeStringUnion(excludeValues), Operations.DEFAULT_MAX_DETERMINIZED_STATES); + } + return a; + } + + public StringFilter convertToStringFilter() { + return new StringFilter(toAutomaton()); + } + + public OrdinalsFilter convertToOrdinalsFilter() { + return new OrdinalsFilter(toAutomaton()); } public LongFilter convertToLongFilter() { @@ -329,6 +349,7 @@ public class IncludeExclude { } return result; } + public LongFilter convertToDoubleFilter() { int numValids = includeValues == null ? 0 : includeValues.size(); int numInvalids = excludeValues == null ? 0 : excludeValues.size(); diff --git a/src/test/java/org/elasticsearch/benchmark/search/aggregations/IncludeExcludeAggregationSearchBenchmark.java b/src/test/java/org/elasticsearch/benchmark/search/aggregations/IncludeExcludeAggregationSearchBenchmark.java new file mode 100644 index 00000000000..7de95e102a5 --- /dev/null +++ b/src/test/java/org/elasticsearch/benchmark/search/aggregations/IncludeExcludeAggregationSearchBenchmark.java @@ -0,0 +1,130 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.benchmark.search.aggregations; + +import org.apache.lucene.util.TestUtil; +import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse; +import org.elasticsearch.action.bulk.BulkRequestBuilder; +import org.elasticsearch.action.bulk.BulkResponse; +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.client.Client; +import org.elasticsearch.common.StopWatch; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.unit.TimeValue; +import org.elasticsearch.index.query.QueryBuilders; +import org.elasticsearch.node.Node; + +import java.util.Random; +import java.util.concurrent.TimeUnit; + +import static org.elasticsearch.client.Requests.createIndexRequest; +import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_REPLICAS; +import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_SHARDS; +import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder; +import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; +import static org.elasticsearch.node.NodeBuilder.nodeBuilder; +import static org.elasticsearch.search.aggregations.AggregationBuilders.terms; + +public class IncludeExcludeAggregationSearchBenchmark { + + private static final Random R = new Random(); + private static final String CLUSTER_NAME = IncludeExcludeAggregationSearchBenchmark.class.getSimpleName(); + private static final int NUM_DOCS = 10000000; + private static final int BATCH = 100; + private static final int WARM = 3; + private static final int RUNS = 10; + private static final int ITERS = 3; + + public static void main(String[] args) { + Settings settings = settingsBuilder() + .put("index.refresh_interval", "-1") + .put(SETTING_NUMBER_OF_SHARDS, 1) + .put(SETTING_NUMBER_OF_REPLICAS, 0) + .build(); + + Node[] nodes = new Node[1]; + for (int i = 0; i < nodes.length; i++) { + nodes[i] = nodeBuilder().clusterName(CLUSTER_NAME) + .settings(settingsBuilder().put(settings).put("name", "node" + i)) + .node(); + } + + Node clientNode = nodeBuilder() + .clusterName(CLUSTER_NAME) + .settings(settingsBuilder().put(settings).put("name", "client")).client(true).node(); + + Client client = clientNode.client(); + + try { + client.admin().indices().create(createIndexRequest("index").settings(settings).mapping("type", + jsonBuilder().startObject().startObject("type").startObject("properties") + .startObject("str") + .field("type", "string") + .field("index", "not_analyzed") + .endObject() + .endObject().endObject().endObject())).actionGet(); + + System.out.println("Indexing " + NUM_DOCS + " documents"); + + StopWatch stopWatch = new StopWatch().start(); + for (int i = 0; i < NUM_DOCS; ) { + BulkRequestBuilder request = client.prepareBulk(); + for (int j = 0; j < BATCH && i < NUM_DOCS; ++j) { + request.add(client.prepareIndex("index", "type", Integer.toString(i)).setSource("str", TestUtil.randomSimpleString(R))); + ++i; + } + BulkResponse response = request.execute().actionGet(); + if (response.hasFailures()) { + System.err.println("--> failures..."); + System.err.println(response.buildFailureMessage()); + } + if ((i % 100000) == 0) { + System.out.println("--> Indexed " + i + " took " + stopWatch.stop().lastTaskTime()); + stopWatch.start(); + } + } + + client.admin().indices().prepareRefresh("index").execute().actionGet(); + } catch (Exception e) { + System.out.println("Index already exists, skipping index creation"); + } + + ClusterHealthResponse clusterHealthResponse = client.admin().cluster().prepareHealth().setWaitForGreenStatus().setTimeout("10m").execute().actionGet(); + if (clusterHealthResponse.isTimedOut()) { + System.err.println("--> Timed out waiting for cluster health"); + } + + for (int i = 0; i < WARM + RUNS; ++i) { + if (i >= WARM) { + System.out.println("RUN " + (i - WARM)); + } + long start = System.nanoTime(); + SearchResponse resp = null; + for (int j = 0; j < ITERS; ++j) { + resp = client.prepareSearch("index").setQuery(QueryBuilders.prefixQuery("str", "sf")).setSize(0).addAggregation(terms("t").field("str").include("s.*")).execute().actionGet(); + } + long end = System.nanoTime(); + if (i >= WARM) { + System.out.println(new TimeValue((end - start) / ITERS, TimeUnit.NANOSECONDS)); + } + } + } + +} diff --git a/src/test/java/org/elasticsearch/search/aggregations/bucket/StringTermsTests.java b/src/test/java/org/elasticsearch/search/aggregations/bucket/StringTermsTests.java index 9590222a72b..3ef59e06a90 100644 --- a/src/test/java/org/elasticsearch/search/aggregations/bucket/StringTermsTests.java +++ b/src/test/java/org/elasticsearch/search/aggregations/bucket/StringTermsTests.java @@ -387,86 +387,6 @@ public class StringTermsTests extends AbstractTermsTests { } } - @Test - public void singleValueField_WithRegexFiltering_WithFlags() throws Exception { - - // include without exclude - // we should be left with: val000, val001, val002, val003, val004, val005, val006, val007, val008, val009 - // with case insensitive flag on the include regex - - SearchResponse response = client().prepareSearch("idx").setTypes("high_card_type") - .addAggregation(terms("terms") - .executionHint(randomExecutionHint()) - .field(SINGLE_VALUED_FIELD_NAME) - .collectMode(randomFrom(SubAggCollectionMode.values())).include("VAL00.+", Pattern.CASE_INSENSITIVE)) - .execute().actionGet(); - - assertSearchResponse(response); - - Terms terms = response.getAggregations().get("terms"); - assertThat(terms, notNullValue()); - assertThat(terms.getName(), equalTo("terms")); - assertThat(terms.getBuckets().size(), equalTo(10)); - - for (int i = 0; i < 10; i++) { - Terms.Bucket bucket = terms.getBucketByKey("val00" + i); - assertThat(bucket, notNullValue()); - assertThat(key(bucket), equalTo("val00" + i)); - assertThat(bucket.getDocCount(), equalTo(1l)); - } - - // include and exclude - // we should be left with: val002, val003, val004, val005, val006, val007, val008, val009 - // with multi-flag masking on the exclude regex - - response = client().prepareSearch("idx").setTypes("high_card_type") - .addAggregation(terms("terms") - .executionHint(randomExecutionHint()) - .field(SINGLE_VALUED_FIELD_NAME) - .collectMode(randomFrom(SubAggCollectionMode.values())).include("val00.+").exclude("( val000 | VAL001 )#this is a comment", Pattern.CASE_INSENSITIVE | Pattern.COMMENTS)) - .execute().actionGet(); - - assertSearchResponse(response); - - terms = response.getAggregations().get("terms"); - assertThat(terms, notNullValue()); - assertThat(terms.getName(), equalTo("terms")); - assertThat(terms.getBuckets().size(), equalTo(8)); - - for (int i = 2; i < 10; i++) { - Terms.Bucket bucket = terms.getBucketByKey("val00" + i); - assertThat(bucket, notNullValue()); - assertThat(key(bucket), equalTo("val00" + i)); - assertThat(bucket.getDocCount(), equalTo(1l)); - } - - // exclude without include - // we should be left with: val000, val001, val002, val003, val004, val005, val006, val007, val008, val009 - // with a "no flag" flag - - response = client().prepareSearch("idx").setTypes("high_card_type") - .addAggregation(terms("terms") - .executionHint(randomExecutionHint()) - .field(SINGLE_VALUED_FIELD_NAME) - .collectMode(randomFrom(SubAggCollectionMode.values())).exclude("val0[1-9]+.+", 0)) - .execute().actionGet(); - - assertSearchResponse(response); - - terms = response.getAggregations().get("terms"); - assertThat(terms, notNullValue()); - assertThat(terms.getName(), equalTo("terms")); - assertThat(terms.getBuckets().size(), equalTo(10)); - - for (int i = 0; i < 10; i++) { - Terms.Bucket bucket = terms.getBucketByKey("val00" + i); - assertThat(bucket, notNullValue()); - assertThat(key(bucket), equalTo("val00" + i)); - assertThat(bucket.getDocCount(), equalTo(1l)); - } - } - - @Test public void singleValueField_WithExactTermFiltering() throws Exception { // include without exclude