From aecd9ac51527fa6ac57788c23527ec4e0dc32636 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Fri, 3 Apr 2015 18:04:22 +0200 Subject: [PATCH] Aggregations: Speed up include/exclude in terms aggregations with regexps. Today we check every regular expression eagerly against every possible term. This can be very slow if you have lots of unique terms, and even the bottleneck if your query is selective. This commit switches to Lucene regular expressions instead of Java (not exactly the same syntax yet most existing regular expressions should keep working) and uses the same logic as RegExpQuery to intersect the regular expression with the terms dictionary. I wrote a quick benchmark (in the PR) to make sure it made things faster and the same request that took 750ms on master now takes 74ms with this change. Close #7526 --- docs/reference/migration/migrate_2_0.asciidoc | 3 + .../bucket/terms-aggregation.asciidoc | 37 +- ...balOrdinalsSignificantTermsAggregator.java | 4 +- .../SignificantStringTermsAggregator.java | 2 +- .../SignificantTermsAggregatorFactory.java | 9 +- .../significant/SignificantTermsParser.java | 2 +- .../GlobalOrdinalsStringTermsAggregator.java | 6 +- .../bucket/terms/StringTermsAggregator.java | 4 +- .../bucket/terms/TermsAggregatorFactory.java | 9 +- .../bucket/terms/TermsBuilder.java | 53 +-- .../bucket/terms/TermsParser.java | 2 +- .../bucket/terms/support/IncludeExclude.java | 325 ++++++++++-------- ...ludeExcludeAggregationSearchBenchmark.java | 130 +++++++ .../aggregations/bucket/StringTermsTests.java | 80 ----- 14 files changed, 337 insertions(+), 329 deletions(-) create mode 100644 src/test/java/org/elasticsearch/benchmark/search/aggregations/IncludeExcludeAggregationSearchBenchmark.java diff --git a/docs/reference/migration/migrate_2_0.asciidoc b/docs/reference/migration/migrate_2_0.asciidoc index dee483e6e77..f875363bb8b 100644 --- a/docs/reference/migration/migrate_2_0.asciidoc +++ b/docs/reference/migration/migrate_2_0.asciidoc @@ -139,6 
+139,9 @@ equivalent to the former `pre_zone` option. Setting `time_zone` to a value like being applied in the specified time zone but In addition to this, also the `pre_zone_adjust_large_interval` is removed because we now always return dates and bucket keys in UTC. +`include`/`exclude` filtering on the `terms` aggregation now uses the same syntax as regexp queries instead of the Java syntax. While simple +regexps should still work, more complex ones might need some rewriting. Also, the `flags` parameter is not supported anymore. + === Terms filter lookup caching The terms filter lookup mechanism does not support the `cache` option anymore diff --git a/docs/reference/search/aggregations/bucket/terms-aggregation.asciidoc b/docs/reference/search/aggregations/bucket/terms-aggregation.asciidoc index 38f6630628a..6b93e926cdd 100644 --- a/docs/reference/search/aggregations/bucket/terms-aggregation.asciidoc +++ b/docs/reference/search/aggregations/bucket/terms-aggregation.asciidoc @@ -482,42 +482,7 @@ with `water_` (so the tag `water_sports` will no be aggregated). The `include` r values are "allowed" to be aggregated, while the `exclude` determines the values that should not be aggregated. When both are defined, the `exclude` has precedence, meaning, the `include` is evaluated first and only then the `exclude`. 
-The regular expression are based on the Java(TM) http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html[Pattern], -and as such, they it is also possible to pass in flags that will determine how the compiled regular expression will work: - -[source,js] --------------------------------------------------- -{ - "aggs" : { - "tags" : { - "terms" : { - "field" : "tags", - "include" : { - "pattern" : ".*sport.*", - "flags" : "CANON_EQ|CASE_INSENSITIVE" <1> - }, - "exclude" : { - "pattern" : "water_.*", - "flags" : "CANON_EQ|CASE_INSENSITIVE" - } - } - } - } -} --------------------------------------------------- - -<1> the flags are concatenated using the `|` character as a separator - -The possible flags that can be used are: -http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#CANON_EQ[`CANON_EQ`], -http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#CASE_INSENSITIVE[`CASE_INSENSITIVE`], -http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#COMMENTS[`COMMENTS`], -http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#DOTALL[`DOTALL`], -http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#LITERAL[`LITERAL`], -http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#MULTILINE[`MULTILINE`], -http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNICODE_CASE[`UNICODE_CASE`], -http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNICODE_CHARACTER_CLASS[`UNICODE_CHARACTER_CLASS`] and -http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNIX_LINES[`UNIX_LINES`] +The syntax is the same as <<regexp-syntax,regexp queries>>.
For matching based on exact values the `include` and `exclude` parameters can simply take an array of strings that represent the terms as they are found in the index: diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/GlobalOrdinalsSignificantTermsAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/GlobalOrdinalsSignificantTermsAggregator.java index fc8e5e4b7f7..49a7e56eefb 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/GlobalOrdinalsSignificantTermsAggregator.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/GlobalOrdinalsSignificantTermsAggregator.java @@ -48,7 +48,7 @@ public class GlobalOrdinalsSignificantTermsAggregator extends GlobalOrdinalsStri public GlobalOrdinalsSignificantTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource, BucketCountThresholds bucketCountThresholds, - IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent, + IncludeExclude.OrdinalsFilter includeExclude, AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggFactory, Map metaData) throws IOException { super(name, factories, valuesSource, null, bucketCountThresholds, includeExclude, aggregationContext, parent, SubAggCollectionMode.DEPTH_FIRST, false, metaData); @@ -145,7 +145,7 @@ public class GlobalOrdinalsSignificantTermsAggregator extends GlobalOrdinalsStri private final LongHash bucketOrds; - public WithHash(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource, BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggFactory, Map metaData) throws IOException { + public WithHash(String name, AggregatorFactories factories, 
ValuesSource.Bytes.WithOrdinals.FieldData valuesSource, BucketCountThresholds bucketCountThresholds, IncludeExclude.OrdinalsFilter includeExclude, AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggFactory, Map metaData) throws IOException { super(name, factories, valuesSource, bucketCountThresholds, includeExclude, aggregationContext, parent, termsAggFactory, metaData); bucketOrds = new LongHash(1, aggregationContext.bigArrays()); } diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java index fb65fd7d6f8..532a71efae7 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantStringTermsAggregator.java @@ -47,7 +47,7 @@ public class SignificantStringTermsAggregator extends StringTermsAggregator { public SignificantStringTermsAggregator(String name, AggregatorFactories factories, ValuesSource valuesSource, BucketCountThresholds bucketCountThresholds, - IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent, + IncludeExclude.StringFilter includeExclude, AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggFactory, Map metaData) throws IOException { super(name, factories, valuesSource, null, bucketCountThresholds, includeExclude, aggregationContext, parent, SubAggCollectionMode.DEPTH_FIRST, false, metaData); diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java index e6058471eb9..ef837cfab82 100644 --- 
a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsAggregatorFactory.java @@ -65,7 +65,8 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource, TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggregatorFactory, Map metaData) throws IOException { - return new SignificantStringTermsAggregator(name, factories, valuesSource, bucketCountThresholds, includeExclude, aggregationContext, parent, termsAggregatorFactory, metaData); + final IncludeExclude.StringFilter filter = includeExclude == null ? null : includeExclude.convertToStringFilter(); + return new SignificantStringTermsAggregator(name, factories, valuesSource, bucketCountThresholds, filter, aggregationContext, parent, termsAggregatorFactory, metaData); } }, @@ -77,7 +78,8 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggregatorFactory, Map metaData) throws IOException { ValuesSource.Bytes.WithOrdinals valueSourceWithOrdinals = (ValuesSource.Bytes.WithOrdinals) valuesSource; IndexSearcher indexSearcher = aggregationContext.searchContext().searcher(); - return new GlobalOrdinalsSignificantTermsAggregator(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, bucketCountThresholds, includeExclude, aggregationContext, parent, termsAggregatorFactory, metaData); + final IncludeExclude.OrdinalsFilter filter = includeExclude == null ? 
null : includeExclude.convertToOrdinalsFilter(); + return new GlobalOrdinalsSignificantTermsAggregator(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, bucketCountThresholds, filter, aggregationContext, parent, termsAggregatorFactory, metaData); } }, @@ -87,7 +89,8 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource, TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggregatorFactory, Map metaData) throws IOException { - return new GlobalOrdinalsSignificantTermsAggregator.WithHash(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, bucketCountThresholds, includeExclude, aggregationContext, parent, termsAggregatorFactory, metaData); + final IncludeExclude.OrdinalsFilter filter = includeExclude == null ? 
null : includeExclude.convertToOrdinalsFilter(); + return new GlobalOrdinalsSignificantTermsAggregator.WithHash(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, bucketCountThresholds, filter, aggregationContext, parent, termsAggregatorFactory, metaData); } }; diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsParser.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsParser.java index 7492b698018..28e0fb5a812 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsParser.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTermsParser.java @@ -57,7 +57,7 @@ public class SignificantTermsParser implements Aggregator.Parser { .scriptable(false) .formattable(true) .build(); - IncludeExclude.Parser incExcParser = new IncludeExclude.Parser(aggregationName, SignificantStringTerms.TYPE, context); + IncludeExclude.Parser incExcParser = new IncludeExclude.Parser(); aggParser.parse(aggregationName, parser, context, vsParser, incExcParser); TermsAggregator.BucketCountThresholds bucketCountThresholds = aggParser.getBucketCountThresholds(); diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index 70904ac2ca3..767f2d50926 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -57,7 +57,7 @@ import java.util.Map; public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggregator { protected final ValuesSource.Bytes.WithOrdinals.FieldData valuesSource; - protected final IncludeExclude includeExclude; + protected 
final IncludeExclude.OrdinalsFilter includeExclude; // TODO: cache the acceptedglobalValues per aggregation definition. // We can't cache this yet in ValuesSource, since ValuesSource is reused per field for aggs during the execution. @@ -71,7 +71,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr public GlobalOrdinalsStringTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource, Terms.Order order, BucketCountThresholds bucketCountThresholds, - IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map metaData) throws IOException { + IncludeExclude.OrdinalsFilter includeExclude, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map metaData) throws IOException { super(name, factories, aggregationContext, parent, order, bucketCountThresholds, collectionMode, showTermDocCountError, metaData); this.valuesSource = valuesSource; this.includeExclude = includeExclude; @@ -260,7 +260,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr private final LongHash bucketOrds; public WithHash(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource, - Terms.Order order, BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude, AggregationContext aggregationContext, + Terms.Order order, BucketCountThresholds bucketCountThresholds, IncludeExclude.OrdinalsFilter includeExclude, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map metaData) throws IOException { super(name, factories, valuesSource, order, bucketCountThresholds, includeExclude, aggregationContext, parent, collectionMode, showTermDocCountError, metaData); bucketOrds = new LongHash(1, 
aggregationContext.bigArrays()); diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregator.java index 9d731a25529..d625e3b9954 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregator.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregator.java @@ -45,11 +45,11 @@ public class StringTermsAggregator extends AbstractStringTermsAggregator { private final ValuesSource valuesSource; protected final BytesRefHash bucketOrds; - private final IncludeExclude includeExclude; + private final IncludeExclude.StringFilter includeExclude; public StringTermsAggregator(String name, AggregatorFactories factories, ValuesSource valuesSource, Terms.Order order, BucketCountThresholds bucketCountThresholds, - IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map metaData) throws IOException { + IncludeExclude.StringFilter includeExclude, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map metaData) throws IOException { super(name, factories, aggregationContext, parent, order, bucketCountThresholds, collectionMode, showTermDocCountError, metaData); this.valuesSource = valuesSource; diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java index 6fbbd306411..3fa99d2b7fd 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java @@ -50,7 +50,8 @@ public class TermsAggregatorFactory extends 
ValuesSourceAggregatorFactory metaData) throws IOException { - return new StringTermsAggregator(name, factories, valuesSource, order, bucketCountThresholds, includeExclude, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData); + final IncludeExclude.StringFilter filter = includeExclude == null ? null : includeExclude.convertToStringFilter(); + return new StringTermsAggregator(name, factories, valuesSource, order, bucketCountThresholds, filter, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData); } @Override @@ -65,7 +66,8 @@ public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory metaData) throws IOException { - return new GlobalOrdinalsStringTermsAggregator(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, order, bucketCountThresholds, includeExclude, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData); + final IncludeExclude.OrdinalsFilter filter = includeExclude == null ? null : includeExclude.convertToOrdinalsFilter(); + return new GlobalOrdinalsStringTermsAggregator(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, order, bucketCountThresholds, filter, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData); } @Override @@ -80,7 +82,8 @@ public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory metaData) throws IOException { - return new GlobalOrdinalsStringTermsAggregator.WithHash(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, order, bucketCountThresholds, includeExclude, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData); + final IncludeExclude.OrdinalsFilter filter = includeExclude == null ? 
null : includeExclude.convertToOrdinalsFilter(); + return new GlobalOrdinalsStringTermsAggregator.WithHash(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, order, bucketCountThresholds, filter, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData); } @Override diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsBuilder.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsBuilder.java index 2243654cf3a..ea2e40587d4 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsBuilder.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsBuilder.java @@ -19,6 +19,7 @@ package org.elasticsearch.search.aggregations.bucket.terms; +import org.apache.lucene.util.automaton.RegExp; import org.elasticsearch.ElasticsearchIllegalArgumentException; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.search.aggregations.Aggregator.SubAggCollectionMode; @@ -37,9 +38,7 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder { private Terms.ValueType valueType; private Terms.Order order; private String includePattern; - private int includeFlags; private String excludePattern; - private int excludeFlags; private String executionHint; private SubAggCollectionMode collectionMode; private Boolean showTermDocCountError; @@ -88,26 +87,15 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder { /** * Define a regular expression that will determine what terms should be aggregated. The regular expression is based - * on the {@link java.util.regex.Pattern} class. + * on the {@link RegExp} class. * - * @see #include(String, int) + * @see {@link RegExp#RegExp(String)} */ public TermsBuilder include(String regex) { - return include(regex, 0); - } - - /** - * Define a regular expression that will determine what terms should be aggregated. 
The regular expression is based - * on the {@link java.util.regex.Pattern} class. - * - * @see java.util.regex.Pattern#compile(String, int) - */ - public TermsBuilder include(String regex, int flags) { if (includeTerms != null) { throw new ElasticsearchIllegalArgumentException("exclude clause must be an array of strings or a regex, not both"); } this.includePattern = regex; - this.includeFlags = flags; return this; } @@ -160,29 +148,18 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder { } return termsAsString; } - - /** - * Define a regular expression that will filter out terms that should be excluded from the aggregation. The regular - * expression is based on the {@link java.util.regex.Pattern} class. - * - * @see #exclude(String, int) - */ - public TermsBuilder exclude(String regex) { - return exclude(regex, 0); - } /** * Define a regular expression that will filter out terms that should be excluded from the aggregation. The regular - * expression is based on the {@link java.util.regex.Pattern} class. + * expression is based on the {@link RegExp} class. 
* - * @see java.util.regex.Pattern#compile(String, int) + * @see {@link RegExp#RegExp(String)} */ - public TermsBuilder exclude(String regex, int flags) { + public TermsBuilder exclude(String regex) { if (excludeTerms != null) { throw new ElasticsearchIllegalArgumentException("exclude clause must be an array of exact values or a regex, not both"); } this.excludePattern = regex; - this.excludeFlags = flags; return this; } @@ -287,27 +264,13 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder { builder.array("include", includeTerms); } if (includePattern != null) { - if (includeFlags == 0) { - builder.field("include", includePattern); - } else { - builder.startObject("include") - .field("pattern", includePattern) - .field("flags", includeFlags) - .endObject(); - } + builder.field("include", includePattern); } if (excludeTerms != null) { builder.array("exclude", excludeTerms); } if (excludePattern != null) { - if (excludeFlags == 0) { - builder.field("exclude", excludePattern); - } else { - builder.startObject("exclude") - .field("pattern", excludePattern) - .field("flags", excludeFlags) - .endObject(); - } + builder.field("exclude", excludePattern); } return builder; } diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsParser.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsParser.java index 01a5cc7bc2f..478309d1bc0 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsParser.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsParser.java @@ -46,7 +46,7 @@ public class TermsParser implements Aggregator.Parser { public AggregatorFactory parse(String aggregationName, XContentParser parser, SearchContext context) throws IOException { TermsParametersParser aggParser = new TermsParametersParser(); ValuesSourceParser vsParser = ValuesSourceParser.any(aggregationName, StringTerms.TYPE, context).scriptable(true).formattable(true).build(); - 
IncludeExclude.Parser incExcParser = new IncludeExclude.Parser(aggregationName, StringTerms.TYPE, context); + IncludeExclude.Parser incExcParser = new IncludeExclude.Parser(); aggParser.parse(aggregationName, parser, context, vsParser, incExcParser); List orderElements = aggParser.getOrderElements(); diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/support/IncludeExclude.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/support/IncludeExclude.java index 6eff3fcedfc..e653ad853a9 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/support/IncludeExclude.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/support/IncludeExclude.java @@ -20,22 +20,30 @@ package org.elasticsearch.search.aggregations.bucket.terms.support; import com.carrotsearch.hppc.LongOpenHashSet; import com.carrotsearch.hppc.LongSet; + import org.apache.lucene.index.RandomAccessOrds; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.util.*; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LongBitSet; +import org.apache.lucene.util.NumericUtils; +import org.apache.lucene.util.automaton.Automata; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.ByteRunAutomaton; +import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.automaton.Operations; +import org.apache.lucene.util.automaton.RegExp; +import org.elasticsearch.ElasticsearchIllegalArgumentException; import org.elasticsearch.ElasticsearchParseException; -import org.elasticsearch.ExceptionsHelper; -import org.elasticsearch.common.regex.Regex; import org.elasticsearch.common.xcontent.XContentParser; -import org.elasticsearch.search.aggregations.InternalAggregation; import org.elasticsearch.search.aggregations.support.ValuesSource; -import 
org.elasticsearch.search.internal.SearchContext; import java.io.IOException; import java.util.HashSet; import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; +import java.util.SortedSet; +import java.util.TreeSet; /** * Defines the include/exclude regular expression filtering for string terms aggregation. In this filtering logic, @@ -43,8 +51,8 @@ import java.util.regex.Pattern; */ public class IncludeExclude { - // The includeValue and excludeValue ByteRefs which are the result of the parsing - // process are converted into a LongFilter when used on numeric fields + // The includeValue and excludeValue ByteRefs which are the result of the parsing + // process are converted into a LongFilter when used on numeric fields // in the index. public static class LongFilter { private LongSet valids; @@ -72,152 +80,145 @@ public class IncludeExclude { } } - private final Matcher include; - private final Matcher exclude; - private final CharsRefBuilder scratch = new CharsRefBuilder(); - private Set includeValues; - private Set excludeValues; - private final boolean hasRegexTest; + // Only used for the 'map' execution mode (ie. scripts) + public static class StringFilter { + + private final ByteRunAutomaton runAutomaton; + + private StringFilter(Automaton automaton) { + this.runAutomaton = new ByteRunAutomaton(automaton); + } + + /** + * Returns whether the given value is accepted based on the {@code include} & {@code exclude} patterns. + */ + public boolean accept(BytesRef value) { + return runAutomaton.run(value.bytes, value.offset, value.length); + } + } + + public static class OrdinalsFilter { + + private final CompiledAutomaton compiled; + + private OrdinalsFilter(Automaton automaton) { + this.compiled = new CompiledAutomaton(automaton); + } + + /** + * Computes which global ordinals are accepted by this IncludeExclude instance. 
+ */ + public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals, ValuesSource.Bytes.WithOrdinals valueSource) throws IOException { + LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount()); + TermsEnum globalTermsEnum; + Terms globalTerms = new DocValuesTerms(globalOrdinals); + // TODO: specialize based on compiled.type: for ALL and prefixes (sinkState >= 0 ) we can avoid i/o and just set bits. + globalTermsEnum = compiled.getTermsEnum(globalTerms); + for (BytesRef term = globalTermsEnum.next(); term != null; term = globalTermsEnum.next()) { + acceptedGlobalOrdinals.set(globalTermsEnum.ord()); + } + return acceptedGlobalOrdinals; + } + + } + + private final RegExp include, exclude; + private final SortedSet includeValues, excludeValues; /** * @param include The regular expression pattern for the terms to be included - * (may only be {@code null} if one of the other arguments is none-null. - * @param includeValues The terms to be included - * (may only be {@code null} if one of the other arguments is none-null. * @param exclude The regular expression pattern for the terms to be excluded - * (may only be {@code null} if one of the other arguments is none-null. - * @param excludeValues The terms to be excluded - * (may only be {@code null} if one of the other arguments is none-null. */ - public IncludeExclude(Pattern include, Pattern exclude, Set includeValues, Set excludeValues) { - assert includeValues != null || include != null || - exclude != null || excludeValues != null : "includes & excludes cannot both be null"; // otherwise IncludeExclude object should be null - this.include = include != null ? include.matcher("") : null; - this.exclude = exclude != null ? 
exclude.matcher("") : null; - hasRegexTest = include != null || exclude != null; + public IncludeExclude(RegExp include, RegExp exclude) { + if (include == null && exclude == null) { + throw new IllegalArgumentException(); + } + this.include = include; + this.exclude = exclude; + this.includeValues = null; + this.excludeValues = null; + } + + /** + * @param includeValues The terms to be included + * @param excludeValues The terms to be excluded + */ + public IncludeExclude(SortedSet includeValues, SortedSet excludeValues) { + if (includeValues == null && excludeValues == null) { + throw new IllegalArgumentException(); + } + this.include = null; + this.exclude = null; this.includeValues = includeValues; this.excludeValues = excludeValues; } /** - * Returns whether the given value is accepted based on the {@code include} & {@code exclude} patterns. + * Terms adapter around doc values. */ - public boolean accept(BytesRef value) { + private static class DocValuesTerms extends Terms { - if (hasRegexTest) { - // We need to perform UTF8 to UTF16 conversion for use in the regex matching - scratch.copyUTF8Bytes(value); - } - return isIncluded(value, scratch.get()) && !isExcluded(value, scratch.get()); - } - - private boolean isIncluded(BytesRef value, CharsRef utf16Chars) { + private final SortedSetDocValues values; - if ((includeValues == null) && (include == null)) { - // No include criteria to be tested. 
- return true; + DocValuesTerms(SortedSetDocValues values) { + this.values = values; } - - if (include != null) { - if (include.reset(scratch.get()).matches()) { - return true; - } + + @Override + public TermsEnum iterator(TermsEnum reuse) throws IOException { + return values.termsEnum(); } - if (includeValues != null) { - if (includeValues.contains(value)) { - return true; - } + + @Override + public long size() throws IOException { + return -1; } - // Some include criteria was tested but no match found - return false; - } - - private boolean isExcluded(BytesRef value, CharsRef utf16Chars) { - if (exclude != null) { - if (exclude.reset(scratch.get()).matches()) { - return true; - } + + @Override + public long getSumTotalTermFreq() throws IOException { + return -1; } - if (excludeValues != null) { - if (excludeValues.contains(value)) { - return true; - } + + @Override + public long getSumDocFreq() throws IOException { + return -1; } - // No exclude criteria was tested or no match found - return false; + + @Override + public int getDocCount() throws IOException { + return -1; + } + + @Override + public boolean hasFreqs() { + return false; + } + + @Override + public boolean hasOffsets() { + return false; + } + + @Override + public boolean hasPositions() { + return false; + } + + @Override + public boolean hasPayloads() { + return false; + } + } - /** - * Computes which global ordinals are accepted by this IncludeExclude instance. 
- */ - public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals, ValuesSource.Bytes.WithOrdinals valueSource) { - LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount()); - // There are 3 ways of populating this bitset: - // 1) Looking up the global ordinals for known "include" terms - // 2) Looking up the global ordinals for known "exclude" terms - // 3) Traversing the term enum for all terms and running past regexes - // Option 3 is known to be very slow in the case of high-cardinality fields and - // should be avoided if possible. - if (includeValues != null) { - // optimize for the case where the set of accepted values is a set - // of known terms, not a regex that would have to be tested against all terms in the index - for (BytesRef includeValue : includeValues) { - // We need to perform UTF8 to UTF16 conversion for use in the regex matching - scratch.copyUTF8Bytes(includeValue); - if (!isExcluded(includeValue, scratch.get())) { - long ord = globalOrdinals.lookupTerm(includeValue); - if (ord >= 0) { - acceptedGlobalOrdinals.set(ord); - } - } - } - } else { - if(hasRegexTest) { - // We have includeVals that are a regex or only regex excludes - we need to do the potentially - // slow option of hitting termsEnum for every term in the index. 
- TermsEnum globalTermsEnum = globalOrdinals.termsEnum(); - try { - for (BytesRef term = globalTermsEnum.next(); term != null; term = globalTermsEnum.next()) { - if (accept(term)) { - acceptedGlobalOrdinals.set(globalTermsEnum.ord()); - } - } - } catch (IOException e) { - throw ExceptionsHelper.convertToElastic(e); - } - } else { - // we only have a set of known values to exclude - create a bitset with all good values and negate the known bads - acceptedGlobalOrdinals.set(0, acceptedGlobalOrdinals.length()); - for (BytesRef excludeValue : excludeValues) { - long ord = globalOrdinals.lookupTerm(excludeValue); - if (ord >= 0) { - acceptedGlobalOrdinals.clear(ord); - } - } - - } - } - return acceptedGlobalOrdinals; - } + public static class Parser { - private final String aggName; - private final InternalAggregation.Type aggType; - private final SearchContext context; - String include = null; - int includeFlags = 0; // 0 means no flags String exclude = null; - int excludeFlags = 0; // 0 means no flags - Set includeValues; - Set excludeValues; - - public Parser(String aggName, InternalAggregation.Type aggType, SearchContext context) { - this.aggName = aggName; - this.aggType = aggType; - this.context = context; - } + SortedSet includeValues; + SortedSet excludeValues; public boolean token(String currentFieldName, XContentParser.Token token, XContentParser parser) throws IOException { @@ -231,14 +232,14 @@ public class IncludeExclude { } return true; } - + if (token == XContentParser.Token.START_ARRAY) { if ("include".equals(currentFieldName)) { - includeValues = parseArrayToSet(parser); + includeValues = new TreeSet<>(parseArrayToSet(parser)); return true; - } + } if ("exclude".equals(currentFieldName)) { - excludeValues = parseArrayToSet(parser); + excludeValues = new TreeSet<>(parseArrayToSet(parser)); return true; } return false; @@ -252,12 +253,6 @@ public class IncludeExclude { } else if (token == XContentParser.Token.VALUE_STRING) { if 
("pattern".equals(currentFieldName)) { include = parser.text(); - } else if ("flags".equals(currentFieldName)) { - includeFlags = Regex.flagsFromString(parser.text()); - } - } else if (token == XContentParser.Token.VALUE_NUMBER) { - if ("flags".equals(currentFieldName)) { - includeFlags = parser.intValue(); } } } @@ -268,12 +263,6 @@ public class IncludeExclude { } else if (token == XContentParser.Token.VALUE_STRING) { if ("pattern".equals(currentFieldName)) { exclude = parser.text(); - } else if ("flags".equals(currentFieldName)) { - excludeFlags = Regex.flagsFromString(parser.text()); - } - } else if (token == XContentParser.Token.VALUE_NUMBER) { - if ("flags".equals(currentFieldName)) { - excludeFlags = parser.intValue(); } } } @@ -298,19 +287,50 @@ public class IncludeExclude { } return set; } - + public IncludeExclude includeExclude() { - if (include == null && exclude == null && includeValues == null && excludeValues == null) { + RegExp includePattern = include != null ? new RegExp(include) : null; + RegExp excludePattern = exclude != null ? new RegExp(exclude) : null; + if (includePattern != null || excludePattern != null) { + if (includeValues != null || excludeValues != null) { + throw new ElasticsearchIllegalArgumentException("Can only use regular expression include/exclude or a set of values, not both"); + } + return new IncludeExclude(includePattern, excludePattern); + } else if (includeValues != null || excludeValues != null) { + return new IncludeExclude(includeValues, excludeValues); + } else { return null; } - Pattern includePattern = include != null ? Pattern.compile(include, includeFlags) : null; - Pattern excludePattern = exclude != null ? 
Pattern.compile(exclude, excludeFlags) : null; - return new IncludeExclude(includePattern, excludePattern, includeValues, excludeValues); } } public boolean isRegexBased() { - return hasRegexTest; + return include != null || exclude != null; + } + + private Automaton toAutomaton() { + Automaton a = null; + if (include != null) { + a = include.toAutomaton(); + } else if (includeValues != null) { + a = Automata.makeStringUnion(includeValues); + } else { + a = Automata.makeAnyString(); + } + if (exclude != null) { + a = Operations.minus(a, exclude.toAutomaton(), Operations.DEFAULT_MAX_DETERMINIZED_STATES); + } else if (excludeValues != null) { + a = Operations.minus(a, Automata.makeStringUnion(excludeValues), Operations.DEFAULT_MAX_DETERMINIZED_STATES); + } + return a; + } + + public StringFilter convertToStringFilter() { + return new StringFilter(toAutomaton()); + } + + public OrdinalsFilter convertToOrdinalsFilter() { + return new OrdinalsFilter(toAutomaton()); } public LongFilter convertToLongFilter() { @@ -329,6 +349,7 @@ public class IncludeExclude { } return result; } + public LongFilter convertToDoubleFilter() { int numValids = includeValues == null ? 0 : includeValues.size(); int numInvalids = excludeValues == null ? 0 : excludeValues.size(); diff --git a/src/test/java/org/elasticsearch/benchmark/search/aggregations/IncludeExcludeAggregationSearchBenchmark.java b/src/test/java/org/elasticsearch/benchmark/search/aggregations/IncludeExcludeAggregationSearchBenchmark.java new file mode 100644 index 00000000000..7de95e102a5 --- /dev/null +++ b/src/test/java/org/elasticsearch/benchmark/search/aggregations/IncludeExcludeAggregationSearchBenchmark.java @@ -0,0 +1,130 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. 
Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.benchmark.search.aggregations; + +import org.apache.lucene.util.TestUtil; +import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse; +import org.elasticsearch.action.bulk.BulkRequestBuilder; +import org.elasticsearch.action.bulk.BulkResponse; +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.client.Client; +import org.elasticsearch.common.StopWatch; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.unit.TimeValue; +import org.elasticsearch.index.query.QueryBuilders; +import org.elasticsearch.node.Node; + +import java.util.Random; +import java.util.concurrent.TimeUnit; + +import static org.elasticsearch.client.Requests.createIndexRequest; +import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_REPLICAS; +import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_SHARDS; +import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder; +import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; +import static org.elasticsearch.node.NodeBuilder.nodeBuilder; +import static org.elasticsearch.search.aggregations.AggregationBuilders.terms; + +public class IncludeExcludeAggregationSearchBenchmark { + + private static final Random R = new Random(); + private static final String 
CLUSTER_NAME = IncludeExcludeAggregationSearchBenchmark.class.getSimpleName(); + private static final int NUM_DOCS = 10000000; + private static final int BATCH = 100; + private static final int WARM = 3; + private static final int RUNS = 10; + private static final int ITERS = 3; + + public static void main(String[] args) { + Settings settings = settingsBuilder() + .put("index.refresh_interval", "-1") + .put(SETTING_NUMBER_OF_SHARDS, 1) + .put(SETTING_NUMBER_OF_REPLICAS, 0) + .build(); + + Node[] nodes = new Node[1]; + for (int i = 0; i < nodes.length; i++) { + nodes[i] = nodeBuilder().clusterName(CLUSTER_NAME) + .settings(settingsBuilder().put(settings).put("name", "node" + i)) + .node(); + } + + Node clientNode = nodeBuilder() + .clusterName(CLUSTER_NAME) + .settings(settingsBuilder().put(settings).put("name", "client")).client(true).node(); + + Client client = clientNode.client(); + + try { + client.admin().indices().create(createIndexRequest("index").settings(settings).mapping("type", + jsonBuilder().startObject().startObject("type").startObject("properties") + .startObject("str") + .field("type", "string") + .field("index", "not_analyzed") + .endObject() + .endObject().endObject().endObject())).actionGet(); + + System.out.println("Indexing " + NUM_DOCS + " documents"); + + StopWatch stopWatch = new StopWatch().start(); + for (int i = 0; i < NUM_DOCS; ) { + BulkRequestBuilder request = client.prepareBulk(); + for (int j = 0; j < BATCH && i < NUM_DOCS; ++j) { + request.add(client.prepareIndex("index", "type", Integer.toString(i)).setSource("str", TestUtil.randomSimpleString(R))); + ++i; + } + BulkResponse response = request.execute().actionGet(); + if (response.hasFailures()) { + System.err.println("--> failures..."); + System.err.println(response.buildFailureMessage()); + } + if ((i % 100000) == 0) { + System.out.println("--> Indexed " + i + " took " + stopWatch.stop().lastTaskTime()); + stopWatch.start(); + } + } + + 
client.admin().indices().prepareRefresh("index").execute().actionGet(); + } catch (Exception e) { + System.out.println("Index already exists, skipping index creation"); + } + + ClusterHealthResponse clusterHealthResponse = client.admin().cluster().prepareHealth().setWaitForGreenStatus().setTimeout("10m").execute().actionGet(); + if (clusterHealthResponse.isTimedOut()) { + System.err.println("--> Timed out waiting for cluster health"); + } + + for (int i = 0; i < WARM + RUNS; ++i) { + if (i >= WARM) { + System.out.println("RUN " + (i - WARM)); + } + long start = System.nanoTime(); + SearchResponse resp = null; + for (int j = 0; j < ITERS; ++j) { + resp = client.prepareSearch("index").setQuery(QueryBuilders.prefixQuery("str", "sf")).setSize(0).addAggregation(terms("t").field("str").include("s.*")).execute().actionGet(); + } + long end = System.nanoTime(); + if (i >= WARM) { + System.out.println(new TimeValue((end - start) / ITERS, TimeUnit.NANOSECONDS)); + } + } + } + +} diff --git a/src/test/java/org/elasticsearch/search/aggregations/bucket/StringTermsTests.java b/src/test/java/org/elasticsearch/search/aggregations/bucket/StringTermsTests.java index 9590222a72b..3ef59e06a90 100644 --- a/src/test/java/org/elasticsearch/search/aggregations/bucket/StringTermsTests.java +++ b/src/test/java/org/elasticsearch/search/aggregations/bucket/StringTermsTests.java @@ -387,86 +387,6 @@ public class StringTermsTests extends AbstractTermsTests { } } - @Test - public void singleValueField_WithRegexFiltering_WithFlags() throws Exception { - - // include without exclude - // we should be left with: val000, val001, val002, val003, val004, val005, val006, val007, val008, val009 - // with case insensitive flag on the include regex - - SearchResponse response = client().prepareSearch("idx").setTypes("high_card_type") - .addAggregation(terms("terms") - .executionHint(randomExecutionHint()) - .field(SINGLE_VALUED_FIELD_NAME) - 
.collectMode(randomFrom(SubAggCollectionMode.values())).include("VAL00.+", Pattern.CASE_INSENSITIVE)) - .execute().actionGet(); - - assertSearchResponse(response); - - Terms terms = response.getAggregations().get("terms"); - assertThat(terms, notNullValue()); - assertThat(terms.getName(), equalTo("terms")); - assertThat(terms.getBuckets().size(), equalTo(10)); - - for (int i = 0; i < 10; i++) { - Terms.Bucket bucket = terms.getBucketByKey("val00" + i); - assertThat(bucket, notNullValue()); - assertThat(key(bucket), equalTo("val00" + i)); - assertThat(bucket.getDocCount(), equalTo(1l)); - } - - // include and exclude - // we should be left with: val002, val003, val004, val005, val006, val007, val008, val009 - // with multi-flag masking on the exclude regex - - response = client().prepareSearch("idx").setTypes("high_card_type") - .addAggregation(terms("terms") - .executionHint(randomExecutionHint()) - .field(SINGLE_VALUED_FIELD_NAME) - .collectMode(randomFrom(SubAggCollectionMode.values())).include("val00.+").exclude("( val000 | VAL001 )#this is a comment", Pattern.CASE_INSENSITIVE | Pattern.COMMENTS)) - .execute().actionGet(); - - assertSearchResponse(response); - - terms = response.getAggregations().get("terms"); - assertThat(terms, notNullValue()); - assertThat(terms.getName(), equalTo("terms")); - assertThat(terms.getBuckets().size(), equalTo(8)); - - for (int i = 2; i < 10; i++) { - Terms.Bucket bucket = terms.getBucketByKey("val00" + i); - assertThat(bucket, notNullValue()); - assertThat(key(bucket), equalTo("val00" + i)); - assertThat(bucket.getDocCount(), equalTo(1l)); - } - - // exclude without include - // we should be left with: val000, val001, val002, val003, val004, val005, val006, val007, val008, val009 - // with a "no flag" flag - - response = client().prepareSearch("idx").setTypes("high_card_type") - .addAggregation(terms("terms") - .executionHint(randomExecutionHint()) - .field(SINGLE_VALUED_FIELD_NAME) - 
.collectMode(randomFrom(SubAggCollectionMode.values())).exclude("val0[1-9]+.+", 0)) - .execute().actionGet(); - - assertSearchResponse(response); - - terms = response.getAggregations().get("terms"); - assertThat(terms, notNullValue()); - assertThat(terms.getName(), equalTo("terms")); - assertThat(terms.getBuckets().size(), equalTo(10)); - - for (int i = 0; i < 10; i++) { - Terms.Bucket bucket = terms.getBucketByKey("val00" + i); - assertThat(bucket, notNullValue()); - assertThat(key(bucket), equalTo("val00" + i)); - assertThat(bucket.getDocCount(), equalTo(1l)); - } - } - - @Test public void singleValueField_WithExactTermFiltering() throws Exception { // include without exclude