Aggregations: Speed up include/exclude in terms aggregations with regexps.

Today we eagerly check every regular expression against every possible term.
This can be very slow when there are many unique terms, and it can even become
the bottleneck when the query itself is selective.

This commit switches from Java to Lucene regular expressions (the syntax is not
exactly the same, but most existing regular expressions should keep working) and
uses the same logic as RegExpQuery to intersect the regular expression with the
terms dictionary. A quick benchmark (included in the PR) confirms the speedup:
a request that takes 750ms on master takes 74ms with this change.

Close #7526
Adrien Grand 2015-04-03 18:04:22 +02:00
parent 2a844fc457
commit aecd9ac515
14 changed files with 337 additions and 329 deletions

View File

@@ -139,6 +139,9 @@ equivalent to the former `pre_zone` option. Setting `time_zone` to a value like
being applied in the specified time zone. In addition to this, the `pre_zone_adjust_large_interval` option is removed because we
now always return dates and bucket keys in UTC.
`include`/`exclude` filtering on the `terms` aggregation now uses the same syntax as regexp queries instead of the Java syntax. While simple
regexps should still work, more complex ones might need some rewriting. Also, the `flags` parameter is not supported anymore.
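For example, a filter that previously relied on Java `Pattern` flags now has to express the same constraint in the pattern itself (a sketch; the patterns are illustrative):

[source,js]
--------------------------------------------------
// before: Java syntax with flags (no longer supported)
"include" : {
    "pattern" : ".*sport.*",
    "flags" : "CASE_INSENSITIVE"
}
// after: regexp query syntax, case-insensitivity spelled out in the pattern
"include" : ".*[sS]port.*"
--------------------------------------------------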
=== Terms filter lookup caching
The terms filter lookup mechanism does not support the `cache` option anymore

View File

@@ -482,42 +482,7 @@ with `water_` (so the tag `water_sports` will not be aggregated). The `include` regular expression will determine what
values are "allowed" to be aggregated, while the `exclude` determines the values that should not be aggregated. When
both are defined, the `exclude` has precedence, meaning, the `include` is evaluated first and only then the `exclude`.
The regular expressions are based on the Java(TM) http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html[Pattern],
and as such, it is also possible to pass in flags that will determine how the compiled regular expression will work:
[source,js]
--------------------------------------------------
{
"aggs" : {
"tags" : {
"terms" : {
"field" : "tags",
"include" : {
"pattern" : ".*sport.*",
"flags" : "CANON_EQ|CASE_INSENSITIVE" <1>
},
"exclude" : {
"pattern" : "water_.*",
"flags" : "CANON_EQ|CASE_INSENSITIVE"
}
}
}
}
}
--------------------------------------------------
<1> the flags are concatenated using the `|` character as a separator
The possible flags that can be used are:
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#CANON_EQ[`CANON_EQ`],
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#CASE_INSENSITIVE[`CASE_INSENSITIVE`],
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#COMMENTS[`COMMENTS`],
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#DOTALL[`DOTALL`],
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#LITERAL[`LITERAL`],
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#MULTILINE[`MULTILINE`],
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNICODE_CASE[`UNICODE_CASE`],
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNICODE_CHARACTER_CLASS[`UNICODE_CHARACTER_CLASS`] and
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNIX_LINES[`UNIX_LINES`]
The syntax is the same as <<regexp-syntax,regexp queries>>.
For matching based on exact values the `include` and `exclude` parameters can simply take an array of
strings that represent the terms as they are found in the index:
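For instance (a sketch in the style of the surrounding docs; the field and values are illustrative):

[source,js]
--------------------------------------------------
{
    "aggs" : {
        "makes" : {
            "terms" : {
                "field" : "make",
                "include" : ["mazda", "honda"],
                "exclude" : ["rover", "jensen"]
            }
        }
    }
}
--------------------------------------------------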

View File

@@ -48,7 +48,7 @@ public class GlobalOrdinalsSignificantTermsAggregator extends GlobalOrdinalsStri
public GlobalOrdinalsSignificantTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource,
BucketCountThresholds bucketCountThresholds,
IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent,
IncludeExclude.OrdinalsFilter includeExclude, AggregationContext aggregationContext, Aggregator parent,
SignificantTermsAggregatorFactory termsAggFactory, Map<String, Object> metaData) throws IOException {
super(name, factories, valuesSource, null, bucketCountThresholds, includeExclude, aggregationContext, parent, SubAggCollectionMode.DEPTH_FIRST, false, metaData);
@@ -145,7 +145,7 @@ public class GlobalOrdinalsSignificantTermsAggregator extends GlobalOrdinalsStri
private final LongHash bucketOrds;
public WithHash(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource, BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggFactory, Map<String, Object> metaData) throws IOException {
public WithHash(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource, BucketCountThresholds bucketCountThresholds, IncludeExclude.OrdinalsFilter includeExclude, AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggFactory, Map<String, Object> metaData) throws IOException {
super(name, factories, valuesSource, bucketCountThresholds, includeExclude, aggregationContext, parent, termsAggFactory, metaData);
bucketOrds = new LongHash(1, aggregationContext.bigArrays());
}

View File

@@ -47,7 +47,7 @@ public class SignificantStringTermsAggregator extends StringTermsAggregator {
public SignificantStringTermsAggregator(String name, AggregatorFactories factories, ValuesSource valuesSource,
BucketCountThresholds bucketCountThresholds,
IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent,
IncludeExclude.StringFilter includeExclude, AggregationContext aggregationContext, Aggregator parent,
SignificantTermsAggregatorFactory termsAggFactory, Map<String, Object> metaData) throws IOException {
super(name, factories, valuesSource, null, bucketCountThresholds, includeExclude, aggregationContext, parent, SubAggCollectionMode.DEPTH_FIRST, false, metaData);

View File

@@ -65,7 +65,8 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource,
TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggregatorFactory, Map<String, Object> metaData) throws IOException {
return new SignificantStringTermsAggregator(name, factories, valuesSource, bucketCountThresholds, includeExclude, aggregationContext, parent, termsAggregatorFactory, metaData);
final IncludeExclude.StringFilter filter = includeExclude == null ? null : includeExclude.convertToStringFilter();
return new SignificantStringTermsAggregator(name, factories, valuesSource, bucketCountThresholds, filter, aggregationContext, parent, termsAggregatorFactory, metaData);
}
},
@@ -77,7 +78,8 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggregatorFactory, Map<String, Object> metaData) throws IOException {
ValuesSource.Bytes.WithOrdinals valueSourceWithOrdinals = (ValuesSource.Bytes.WithOrdinals) valuesSource;
IndexSearcher indexSearcher = aggregationContext.searchContext().searcher();
return new GlobalOrdinalsSignificantTermsAggregator(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, bucketCountThresholds, includeExclude, aggregationContext, parent, termsAggregatorFactory, metaData);
final IncludeExclude.OrdinalsFilter filter = includeExclude == null ? null : includeExclude.convertToOrdinalsFilter();
return new GlobalOrdinalsSignificantTermsAggregator(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, bucketCountThresholds, filter, aggregationContext, parent, termsAggregatorFactory, metaData);
}
},
@@ -87,7 +89,8 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource,
TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggregatorFactory, Map<String, Object> metaData) throws IOException {
return new GlobalOrdinalsSignificantTermsAggregator.WithHash(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, bucketCountThresholds, includeExclude, aggregationContext, parent, termsAggregatorFactory, metaData);
final IncludeExclude.OrdinalsFilter filter = includeExclude == null ? null : includeExclude.convertToOrdinalsFilter();
return new GlobalOrdinalsSignificantTermsAggregator.WithHash(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, bucketCountThresholds, filter, aggregationContext, parent, termsAggregatorFactory, metaData);
}
};

View File

@@ -57,7 +57,7 @@ public class SignificantTermsParser implements Aggregator.Parser {
.scriptable(false)
.formattable(true)
.build();
IncludeExclude.Parser incExcParser = new IncludeExclude.Parser(aggregationName, SignificantStringTerms.TYPE, context);
IncludeExclude.Parser incExcParser = new IncludeExclude.Parser();
aggParser.parse(aggregationName, parser, context, vsParser, incExcParser);
TermsAggregator.BucketCountThresholds bucketCountThresholds = aggParser.getBucketCountThresholds();

View File

@@ -57,7 +57,7 @@ import java.util.Map;
public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggregator {
protected final ValuesSource.Bytes.WithOrdinals.FieldData valuesSource;
protected final IncludeExclude includeExclude;
protected final IncludeExclude.OrdinalsFilter includeExclude;
// TODO: cache the acceptedGlobalValues per aggregation definition.
// We can't cache this yet in ValuesSource, since ValuesSource is reused per field for aggs during the execution.
@@ -71,7 +71,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
public GlobalOrdinalsStringTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource,
Terms.Order order, BucketCountThresholds bucketCountThresholds,
IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
IncludeExclude.OrdinalsFilter includeExclude, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
super(name, factories, aggregationContext, parent, order, bucketCountThresholds, collectionMode, showTermDocCountError, metaData);
this.valuesSource = valuesSource;
this.includeExclude = includeExclude;
@@ -260,7 +260,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
private final LongHash bucketOrds;
public WithHash(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource,
Terms.Order order, BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude, AggregationContext aggregationContext,
Terms.Order order, BucketCountThresholds bucketCountThresholds, IncludeExclude.OrdinalsFilter includeExclude, AggregationContext aggregationContext,
Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
super(name, factories, valuesSource, order, bucketCountThresholds, includeExclude, aggregationContext, parent, collectionMode, showTermDocCountError, metaData);
bucketOrds = new LongHash(1, aggregationContext.bigArrays());

View File

@@ -45,11 +45,11 @@ public class StringTermsAggregator extends AbstractStringTermsAggregator {
private final ValuesSource valuesSource;
protected final BytesRefHash bucketOrds;
private final IncludeExclude includeExclude;
private final IncludeExclude.StringFilter includeExclude;
public StringTermsAggregator(String name, AggregatorFactories factories, ValuesSource valuesSource,
Terms.Order order, BucketCountThresholds bucketCountThresholds,
IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
IncludeExclude.StringFilter includeExclude, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
super(name, factories, aggregationContext, parent, order, bucketCountThresholds, collectionMode, showTermDocCountError, metaData);
this.valuesSource = valuesSource;

View File

@@ -50,7 +50,8 @@ public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory<Values
Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource,
Terms.Order order, TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode subAggCollectMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
return new StringTermsAggregator(name, factories, valuesSource, order, bucketCountThresholds, includeExclude, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData);
final IncludeExclude.StringFilter filter = includeExclude == null ? null : includeExclude.convertToStringFilter();
return new StringTermsAggregator(name, factories, valuesSource, order, bucketCountThresholds, filter, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData);
}
@Override
@@ -65,7 +66,8 @@ public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory<Values
Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource,
Terms.Order order, TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode subAggCollectMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
return new GlobalOrdinalsStringTermsAggregator(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, order, bucketCountThresholds, includeExclude, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData);
final IncludeExclude.OrdinalsFilter filter = includeExclude == null ? null : includeExclude.convertToOrdinalsFilter();
return new GlobalOrdinalsStringTermsAggregator(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, order, bucketCountThresholds, filter, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData);
}
@Override
@@ -80,7 +82,8 @@ public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory<Values
Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource,
Terms.Order order, TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode subAggCollectMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
return new GlobalOrdinalsStringTermsAggregator.WithHash(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, order, bucketCountThresholds, includeExclude, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData);
final IncludeExclude.OrdinalsFilter filter = includeExclude == null ? null : includeExclude.convertToOrdinalsFilter();
return new GlobalOrdinalsStringTermsAggregator.WithHash(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, order, bucketCountThresholds, filter, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData);
}
@Override

View File

@@ -19,6 +19,7 @@
package org.elasticsearch.search.aggregations.bucket.terms;
import org.apache.lucene.util.automaton.RegExp;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.search.aggregations.Aggregator.SubAggCollectionMode;
@@ -37,9 +38,7 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder<TermsBuilder> {
private Terms.ValueType valueType;
private Terms.Order order;
private String includePattern;
private int includeFlags;
private String excludePattern;
private int excludeFlags;
private String executionHint;
private SubAggCollectionMode collectionMode;
private Boolean showTermDocCountError;
@@ -88,26 +87,15 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder<TermsBuilder> {
/**
* Define a regular expression that will determine what terms should be aggregated. The regular expression is based
* on the {@link java.util.regex.Pattern} class.
* on the {@link RegExp} class.
*
* @see #include(String, int)
* @see {@link RegExp#RegExp(String)}
*/
public TermsBuilder include(String regex) {
return include(regex, 0);
}
/**
* Define a regular expression that will determine what terms should be aggregated. The regular expression is based
* on the {@link java.util.regex.Pattern} class.
*
* @see java.util.regex.Pattern#compile(String, int)
*/
public TermsBuilder include(String regex, int flags) {
if (includeTerms != null) {
throw new ElasticsearchIllegalArgumentException("exclude clause must be an array of strings or a regex, not both");
}
this.includePattern = regex;
this.includeFlags = flags;
return this;
}
@@ -163,26 +151,15 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder<TermsBuilder> {
/**
* Define a regular expression that will filter out terms that should be excluded from the aggregation. The regular
* expression is based on the {@link java.util.regex.Pattern} class.
* expression is based on the {@link RegExp} class.
*
* @see #exclude(String, int)
* @see {@link RegExp#RegExp(String)}
*/
public TermsBuilder exclude(String regex) {
return exclude(regex, 0);
}
/**
* Define a regular expression that will filter out terms that should be excluded from the aggregation. The regular
* expression is based on the {@link java.util.regex.Pattern} class.
*
* @see java.util.regex.Pattern#compile(String, int)
*/
public TermsBuilder exclude(String regex, int flags) {
if (excludeTerms != null) {
throw new ElasticsearchIllegalArgumentException("exclude clause must be an array of exact values or a regex, not both");
}
this.excludePattern = regex;
this.excludeFlags = flags;
return this;
}
@@ -287,27 +264,13 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder<TermsBuilder> {
builder.array("include", includeTerms);
}
if (includePattern != null) {
if (includeFlags == 0) {
builder.field("include", includePattern);
} else {
builder.startObject("include")
.field("pattern", includePattern)
.field("flags", includeFlags)
.endObject();
}
}
if (excludeTerms != null) {
builder.array("exclude", excludeTerms);
}
if (excludePattern != null) {
if (excludeFlags == 0) {
builder.field("exclude", excludePattern);
} else {
builder.startObject("exclude")
.field("pattern", excludePattern)
.field("flags", excludeFlags)
.endObject();
}
}
return builder;
}

View File

@@ -46,7 +46,7 @@ public class TermsParser implements Aggregator.Parser {
public AggregatorFactory parse(String aggregationName, XContentParser parser, SearchContext context) throws IOException {
TermsParametersParser aggParser = new TermsParametersParser();
ValuesSourceParser vsParser = ValuesSourceParser.any(aggregationName, StringTerms.TYPE, context).scriptable(true).formattable(true).build();
IncludeExclude.Parser incExcParser = new IncludeExclude.Parser(aggregationName, StringTerms.TYPE, context);
IncludeExclude.Parser incExcParser = new IncludeExclude.Parser();
aggParser.parse(aggregationName, parser, context, vsParser, incExcParser);
List<OrderElement> orderElements = aggParser.getOrderElements();

View File

@@ -20,22 +20,30 @@ package org.elasticsearch.search.aggregations.bucket.terms.support;
import com.carrotsearch.hppc.LongOpenHashSet;
import com.carrotsearch.hppc.LongSet;
import org.apache.lucene.index.RandomAccessOrds;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.*;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LongBitSet;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.search.aggregations.InternalAggregation;
import org.elasticsearch.search.aggregations.support.ValuesSource;
import org.elasticsearch.search.internal.SearchContext;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.SortedSet;
import java.util.TreeSet;
/**
* Defines the include/exclude regular expression filtering for string terms aggregation. In this filtering logic,
@@ -72,152 +80,145 @@ public class IncludeExclude {
}
}
private final Matcher include;
private final Matcher exclude;
private final CharsRefBuilder scratch = new CharsRefBuilder();
private Set<BytesRef> includeValues;
private Set<BytesRef> excludeValues;
private final boolean hasRegexTest;
// Only used for the 'map' execution mode (ie. scripts)
public static class StringFilter {
/**
* @param include The regular expression pattern for the terms to be included
* (may only be {@code null} if one of the other arguments is non-null).
* @param includeValues The terms to be included
* (may only be {@code null} if one of the other arguments is non-null).
* @param exclude The regular expression pattern for the terms to be excluded
* (may only be {@code null} if one of the other arguments is non-null).
* @param excludeValues The terms to be excluded
* (may only be {@code null} if one of the other arguments is non-null).
*/
public IncludeExclude(Pattern include, Pattern exclude, Set<BytesRef> includeValues, Set<BytesRef> excludeValues) {
assert includeValues != null || include != null ||
exclude != null || excludeValues != null : "includes & excludes cannot both be null"; // otherwise IncludeExclude object should be null
this.include = include != null ? include.matcher("") : null;
this.exclude = exclude != null ? exclude.matcher("") : null;
hasRegexTest = include != null || exclude != null;
this.includeValues = includeValues;
this.excludeValues = excludeValues;
private final ByteRunAutomaton runAutomaton;
private StringFilter(Automaton automaton) {
this.runAutomaton = new ByteRunAutomaton(automaton);
}
/**
* Returns whether the given value is accepted based on the {@code include} & {@code exclude} patterns.
*/
public boolean accept(BytesRef value) {
if (hasRegexTest) {
// We need to perform UTF8 to UTF16 conversion for use in the regex matching
scratch.copyUTF8Bytes(value);
return runAutomaton.run(value.bytes, value.offset, value.length);
}
return isIncluded(value, scratch.get()) && !isExcluded(value, scratch.get());
}
private boolean isIncluded(BytesRef value, CharsRef utf16Chars) {
public static class OrdinalsFilter {
if ((includeValues == null) && (include == null)) {
// No include criteria to be tested.
return true;
}
private final CompiledAutomaton compiled;
if (include != null) {
if (include.reset(scratch.get()).matches()) {
return true;
}
}
if (includeValues != null) {
if (includeValues.contains(value)) {
return true;
}
}
// Some include criteria was tested but no match found
return false;
}
private boolean isExcluded(BytesRef value, CharsRef utf16Chars) {
if (exclude != null) {
if (exclude.reset(scratch.get()).matches()) {
return true;
}
}
if (excludeValues != null) {
if (excludeValues.contains(value)) {
return true;
}
}
// No exclude criteria was tested or no match found
return false;
private OrdinalsFilter(Automaton automaton) {
this.compiled = new CompiledAutomaton(automaton);
}
/**
* Computes which global ordinals are accepted by this IncludeExclude instance.
*/
public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals, ValuesSource.Bytes.WithOrdinals valueSource) {
public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals, ValuesSource.Bytes.WithOrdinals valueSource) throws IOException {
LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
// There are 3 ways of populating this bitset:
// 1) Looking up the global ordinals for known "include" terms
// 2) Looking up the global ordinals for known "exclude" terms
// 3) Traversing the term enum for all terms and running past regexes
// Option 3 is known to be very slow in the case of high-cardinality fields and
// should be avoided if possible.
if (includeValues != null) {
// optimize for the case where the set of accepted values is a set
// of known terms, not a regex that would have to be tested against all terms in the index
for (BytesRef includeValue : includeValues) {
// We need to perform UTF8 to UTF16 conversion for use in the regex matching
scratch.copyUTF8Bytes(includeValue);
if (!isExcluded(includeValue, scratch.get())) {
long ord = globalOrdinals.lookupTerm(includeValue);
if (ord >= 0) {
acceptedGlobalOrdinals.set(ord);
}
}
}
} else {
if(hasRegexTest) {
// We have includeVals that are a regex or only regex excludes - we need to do the potentially
// slow option of hitting termsEnum for every term in the index.
TermsEnum globalTermsEnum = globalOrdinals.termsEnum();
try {
TermsEnum globalTermsEnum;
Terms globalTerms = new DocValuesTerms(globalOrdinals);
// TODO: specialize based on compiled.type: for ALL and prefixes (sinkState >= 0 ) we can avoid i/o and just set bits.
globalTermsEnum = compiled.getTermsEnum(globalTerms);
for (BytesRef term = globalTermsEnum.next(); term != null; term = globalTermsEnum.next()) {
if (accept(term)) {
acceptedGlobalOrdinals.set(globalTermsEnum.ord());
}
}
} catch (IOException e) {
throw ExceptionsHelper.convertToElastic(e);
}
} else {
// we only have a set of known values to exclude - create a bitset with all good values and negate the known bads
acceptedGlobalOrdinals.set(0, acceptedGlobalOrdinals.length());
for (BytesRef excludeValue : excludeValues) {
long ord = globalOrdinals.lookupTerm(excludeValue);
if (ord >= 0) {
acceptedGlobalOrdinals.clear(ord);
}
}
}
}
return acceptedGlobalOrdinals;
}
}
private final RegExp include, exclude;
private final SortedSet<BytesRef> includeValues, excludeValues;
/**
* @param include The regular expression pattern for the terms to be included
* @param exclude The regular expression pattern for the terms to be excluded
*/
public IncludeExclude(RegExp include, RegExp exclude) {
if (include == null && exclude == null) {
throw new IllegalArgumentException();
}
this.include = include;
this.exclude = exclude;
this.includeValues = null;
this.excludeValues = null;
}
/**
* @param includeValues The terms to be included
* @param excludeValues The terms to be excluded
*/
public IncludeExclude(SortedSet<BytesRef> includeValues, SortedSet<BytesRef> excludeValues) {
if (includeValues == null && excludeValues == null) {
throw new IllegalArgumentException();
}
this.include = null;
this.exclude = null;
this.includeValues = includeValues;
this.excludeValues = excludeValues;
}
/**
* Terms adapter around doc values.
*/
private static class DocValuesTerms extends Terms {
private final SortedSetDocValues values;
DocValuesTerms(SortedSetDocValues values) {
this.values = values;
}
@Override
public TermsEnum iterator(TermsEnum reuse) throws IOException {
return values.termsEnum();
}
@Override
public long size() throws IOException {
return -1;
}
@Override
public long getSumTotalTermFreq() throws IOException {
return -1;
}
@Override
public long getSumDocFreq() throws IOException {
return -1;
}
@Override
public int getDocCount() throws IOException {
return -1;
}
@Override
public boolean hasFreqs() {
return false;
}
@Override
public boolean hasOffsets() {
return false;
}
@Override
public boolean hasPositions() {
return false;
}
@Override
public boolean hasPayloads() {
return false;
}
}
public static class Parser {
private final String aggName;
private final InternalAggregation.Type aggType;
private final SearchContext context;
String include = null;
int includeFlags = 0; // 0 means no flags
String exclude = null;
int excludeFlags = 0; // 0 means no flags
Set<BytesRef> includeValues;
Set<BytesRef> excludeValues;
public Parser(String aggName, InternalAggregation.Type aggType, SearchContext context) {
this.aggName = aggName;
this.aggType = aggType;
this.context = context;
}
SortedSet<BytesRef> includeValues;
SortedSet<BytesRef> excludeValues;
public boolean token(String currentFieldName, XContentParser.Token token, XContentParser parser) throws IOException {
@@ -234,11 +235,11 @@ public class IncludeExclude {
if (token == XContentParser.Token.START_ARRAY) {
if ("include".equals(currentFieldName)) {
includeValues = parseArrayToSet(parser);
includeValues = new TreeSet<>(parseArrayToSet(parser));
return true;
}
if ("exclude".equals(currentFieldName)) {
excludeValues = parseArrayToSet(parser);
excludeValues = new TreeSet<>(parseArrayToSet(parser));
return true;
}
return false;
@@ -252,12 +253,6 @@ public class IncludeExclude {
} else if (token == XContentParser.Token.VALUE_STRING) {
if ("pattern".equals(currentFieldName)) {
include = parser.text();
} else if ("flags".equals(currentFieldName)) {
includeFlags = Regex.flagsFromString(parser.text());
}
} else if (token == XContentParser.Token.VALUE_NUMBER) {
if ("flags".equals(currentFieldName)) {
includeFlags = parser.intValue();
}
}
}
@@ -268,12 +263,6 @@ public class IncludeExclude {
} else if (token == XContentParser.Token.VALUE_STRING) {
if ("pattern".equals(currentFieldName)) {
exclude = parser.text();
} else if ("flags".equals(currentFieldName)) {
excludeFlags = Regex.flagsFromString(parser.text());
}
} else if (token == XContentParser.Token.VALUE_NUMBER) {
if ("flags".equals(currentFieldName)) {
excludeFlags = parser.intValue();
}
}
}
@@ -300,17 +289,48 @@ public class IncludeExclude {
}
public IncludeExclude includeExclude() {
if (include == null && exclude == null && includeValues == null && excludeValues == null) {
RegExp includePattern = include != null ? new RegExp(include) : null;
RegExp excludePattern = exclude != null ? new RegExp(exclude) : null;
if (includePattern != null || excludePattern != null) {
if (includeValues != null || excludeValues != null) {
throw new ElasticsearchIllegalArgumentException("Can only use regular expression include/exclude or a set of values, not both");
}
return new IncludeExclude(includePattern, excludePattern);
} else if (includeValues != null || excludeValues != null) {
return new IncludeExclude(includeValues, excludeValues);
} else {
return null;
}
Pattern includePattern = include != null ? Pattern.compile(include, includeFlags) : null;
Pattern excludePattern = exclude != null ? Pattern.compile(exclude, excludeFlags) : null;
return new IncludeExclude(includePattern, excludePattern, includeValues, excludeValues);
}
}
public boolean isRegexBased() {
return hasRegexTest;
return include != null || exclude != null;
}
private Automaton toAutomaton() {
Automaton a = null;
if (include != null) {
a = include.toAutomaton();
} else if (includeValues != null) {
a = Automata.makeStringUnion(includeValues);
} else {
a = Automata.makeAnyString();
}
if (exclude != null) {
a = Operations.minus(a, exclude.toAutomaton(), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
} else if (excludeValues != null) {
a = Operations.minus(a, Automata.makeStringUnion(excludeValues), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
}
return a;
}
public StringFilter convertToStringFilter() {
return new StringFilter(toAutomaton());
}
public OrdinalsFilter convertToOrdinalsFilter() {
return new OrdinalsFilter(toAutomaton());
}
public LongFilter convertToLongFilter() {
@@ -329,6 +349,7 @@ public class IncludeExclude {
}
return result;
}
public LongFilter convertToDoubleFilter() {
int numValids = includeValues == null ? 0 : includeValues.size();
int numInvalids = excludeValues == null ? 0 : excludeValues.size();
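Taken together, the new API reads roughly as follows (a hedged sketch that only uses the constructors and converters shown in this diff; the pattern and term are illustrative):

// Build an IncludeExclude from a Lucene regular expression, then convert it
// to the filter flavour required by the aggregator's execution mode.
IncludeExclude incExc = new IncludeExclude(new RegExp("val00.*"), null);
IncludeExclude.StringFilter stringFilter = incExc.convertToStringFilter();
boolean accepted = stringFilter.accept(new BytesRef("val001")); // 'map' execution mode
IncludeExclude.OrdinalsFilter ordinalsFilter = incExc.convertToOrdinalsFilter(); // global-ordinals modes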

View File

@@ -0,0 +1,130 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.benchmark.search.aggregations;
import org.apache.lucene.util.TestUtil;
import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.StopWatch;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.node.Node;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import static org.elasticsearch.client.Requests.createIndexRequest;
import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_REPLICAS;
import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_SHARDS;
import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.elasticsearch.node.NodeBuilder.nodeBuilder;
import static org.elasticsearch.search.aggregations.AggregationBuilders.terms;
public class IncludeExcludeAggregationSearchBenchmark {
private static final Random R = new Random();
private static final String CLUSTER_NAME = IncludeExcludeAggregationSearchBenchmark.class.getSimpleName();
private static final int NUM_DOCS = 10000000;
private static final int BATCH = 100;
private static final int WARM = 3;
private static final int RUNS = 10;
private static final int ITERS = 3;
public static void main(String[] args) {
Settings settings = settingsBuilder()
.put("index.refresh_interval", "-1")
.put(SETTING_NUMBER_OF_SHARDS, 1)
.put(SETTING_NUMBER_OF_REPLICAS, 0)
.build();
Node[] nodes = new Node[1];
for (int i = 0; i < nodes.length; i++) {
nodes[i] = nodeBuilder().clusterName(CLUSTER_NAME)
.settings(settingsBuilder().put(settings).put("name", "node" + i))
.node();
}
Node clientNode = nodeBuilder()
.clusterName(CLUSTER_NAME)
.settings(settingsBuilder().put(settings).put("name", "client")).client(true).node();
Client client = clientNode.client();
try {
client.admin().indices().create(createIndexRequest("index").settings(settings).mapping("type",
jsonBuilder().startObject().startObject("type").startObject("properties")
.startObject("str")
.field("type", "string")
.field("index", "not_analyzed")
.endObject()
.endObject().endObject().endObject())).actionGet();
System.out.println("Indexing " + NUM_DOCS + " documents");
StopWatch stopWatch = new StopWatch().start();
for (int i = 0; i < NUM_DOCS; ) {
BulkRequestBuilder request = client.prepareBulk();
for (int j = 0; j < BATCH && i < NUM_DOCS; ++j) {
request.add(client.prepareIndex("index", "type", Integer.toString(i)).setSource("str", TestUtil.randomSimpleString(R)));
++i;
}
BulkResponse response = request.execute().actionGet();
if (response.hasFailures()) {
System.err.println("--> failures...");
System.err.println(response.buildFailureMessage());
}
if ((i % 100000) == 0) {
System.out.println("--> Indexed " + i + " took " + stopWatch.stop().lastTaskTime());
stopWatch.start();
}
}
client.admin().indices().prepareRefresh("index").execute().actionGet();
} catch (Exception e) {
System.out.println("Index already exists, skipping index creation");
}
ClusterHealthResponse clusterHealthResponse = client.admin().cluster().prepareHealth().setWaitForGreenStatus().setTimeout("10m").execute().actionGet();
if (clusterHealthResponse.isTimedOut()) {
System.err.println("--> Timed out waiting for cluster health");
}
for (int i = 0; i < WARM + RUNS; ++i) {
if (i >= WARM) {
System.out.println("RUN " + (i - WARM));
}
long start = System.nanoTime();
SearchResponse resp = null;
for (int j = 0; j < ITERS; ++j) {
resp = client.prepareSearch("index").setQuery(QueryBuilders.prefixQuery("str", "sf")).setSize(0).addAggregation(terms("t").field("str").include("s.*")).execute().actionGet();
}
long end = System.nanoTime();
if (i >= WARM) {
System.out.println(new TimeValue((end - start) / ITERS, TimeUnit.NANOSECONDS));
}
}
}
}

View File

@@ -387,86 +387,6 @@ public class StringTermsTests extends AbstractTermsTests {
}
}
@Test
public void singleValueField_WithRegexFiltering_WithFlags() throws Exception {
// include without exclude
// we should be left with: val000, val001, val002, val003, val004, val005, val006, val007, val008, val009
// with case insensitive flag on the include regex
SearchResponse response = client().prepareSearch("idx").setTypes("high_card_type")
.addAggregation(terms("terms")
.executionHint(randomExecutionHint())
.field(SINGLE_VALUED_FIELD_NAME)
.collectMode(randomFrom(SubAggCollectionMode.values())).include("VAL00.+", Pattern.CASE_INSENSITIVE))
.execute().actionGet();
assertSearchResponse(response);
Terms terms = response.getAggregations().get("terms");
assertThat(terms, notNullValue());
assertThat(terms.getName(), equalTo("terms"));
assertThat(terms.getBuckets().size(), equalTo(10));
for (int i = 0; i < 10; i++) {
Terms.Bucket bucket = terms.getBucketByKey("val00" + i);
assertThat(bucket, notNullValue());
assertThat(key(bucket), equalTo("val00" + i));
assertThat(bucket.getDocCount(), equalTo(1l));
}
// include and exclude
// we should be left with: val002, val003, val004, val005, val006, val007, val008, val009
// with multi-flag masking on the exclude regex
response = client().prepareSearch("idx").setTypes("high_card_type")
.addAggregation(terms("terms")
.executionHint(randomExecutionHint())
.field(SINGLE_VALUED_FIELD_NAME)
.collectMode(randomFrom(SubAggCollectionMode.values())).include("val00.+").exclude("( val000 | VAL001 )#this is a comment", Pattern.CASE_INSENSITIVE | Pattern.COMMENTS))
.execute().actionGet();
assertSearchResponse(response);
terms = response.getAggregations().get("terms");
assertThat(terms, notNullValue());
assertThat(terms.getName(), equalTo("terms"));
assertThat(terms.getBuckets().size(), equalTo(8));
for (int i = 2; i < 10; i++) {
Terms.Bucket bucket = terms.getBucketByKey("val00" + i);
assertThat(bucket, notNullValue());
assertThat(key(bucket), equalTo("val00" + i));
assertThat(bucket.getDocCount(), equalTo(1l));
}
// exclude without include
// we should be left with: val000, val001, val002, val003, val004, val005, val006, val007, val008, val009
// with a "no flag" flag
response = client().prepareSearch("idx").setTypes("high_card_type")
.addAggregation(terms("terms")
.executionHint(randomExecutionHint())
.field(SINGLE_VALUED_FIELD_NAME)
.collectMode(randomFrom(SubAggCollectionMode.values())).exclude("val0[1-9]+.+", 0))
.execute().actionGet();
assertSearchResponse(response);
terms = response.getAggregations().get("terms");
assertThat(terms, notNullValue());
assertThat(terms.getName(), equalTo("terms"));
assertThat(terms.getBuckets().size(), equalTo(10));
for (int i = 0; i < 10; i++) {
Terms.Bucket bucket = terms.getBucketByKey("val00" + i);
assertThat(bucket, notNullValue());
assertThat(key(bucket), equalTo("val00" + i));
assertThat(bucket.getDocCount(), equalTo(1l));
}
}
@Test
public void singleValueField_WithExactTermFiltering() throws Exception {
// include without exclude