Merge pull request #10418 from jpountz/enhancement/speed_up_aggs_include_exclude

Aggregations: Speed up include/exclude in terms aggregations with regexps.

Close #10418
This commit is contained in:
Adrien Grand 2015-04-09 12:16:37 +02:00
commit e25db222ee
14 changed files with 337 additions and 329 deletions

View File

@ -139,6 +139,9 @@ equivalent to the former `pre_zone` option. Setting `time_zone` to a value like
being applied in the specified time zone but In addition to this, also the `pre_zone_adjust_large_interval` is removed because we
now always return dates and bucket keys in UTC.
`include`/`exclude` filtering on the `terms` aggregation now uses the same syntax as regexp queries instead of the Java syntax. While simple
regexps should still work, more complex ones might need some rewriting. Also, the `flags` parameter is not supported anymore.
=== Terms filter lookup caching
The terms filter lookup mechanism does not support the `cache` option anymore

View File

@ -482,42 +482,7 @@ with `water_` (so the tag `water_sports` will no be aggregated). The `include` r
values are "allowed" to be aggregated, while the `exclude` determines the values that should not be aggregated. When
both are defined, the `exclude` has precedence, meaning, the `include` is evaluated first and only then the `exclude`.
The regular expression are based on the Java(TM) http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html[Pattern],
and as such, they it is also possible to pass in flags that will determine how the compiled regular expression will work:
[source,js]
--------------------------------------------------
{
"aggs" : {
"tags" : {
"terms" : {
"field" : "tags",
"include" : {
"pattern" : ".*sport.*",
"flags" : "CANON_EQ|CASE_INSENSITIVE" <1>
},
"exclude" : {
"pattern" : "water_.*",
"flags" : "CANON_EQ|CASE_INSENSITIVE"
}
}
}
}
}
--------------------------------------------------
<1> the flags are concatenated using the `|` character as a separator
The possible flags that can be used are:
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#CANON_EQ[`CANON_EQ`],
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#CASE_INSENSITIVE[`CASE_INSENSITIVE`],
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#COMMENTS[`COMMENTS`],
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#DOTALL[`DOTALL`],
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#LITERAL[`LITERAL`],
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#MULTILINE[`MULTILINE`],
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNICODE_CASE[`UNICODE_CASE`],
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNICODE_CHARACTER_CLASS[`UNICODE_CHARACTER_CLASS`] and
http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#UNIX_LINES[`UNIX_LINES`]
The syntax is the same as <<regexp-syntax,regexp queries>>.
For matching based on exact values the `include` and `exclude` parameters can simply take an array of
strings that represent the terms as they are found in the index:

View File

@ -48,7 +48,7 @@ public class GlobalOrdinalsSignificantTermsAggregator extends GlobalOrdinalsStri
public GlobalOrdinalsSignificantTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource,
BucketCountThresholds bucketCountThresholds,
IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent,
IncludeExclude.OrdinalsFilter includeExclude, AggregationContext aggregationContext, Aggregator parent,
SignificantTermsAggregatorFactory termsAggFactory, Map<String, Object> metaData) throws IOException {
super(name, factories, valuesSource, null, bucketCountThresholds, includeExclude, aggregationContext, parent, SubAggCollectionMode.DEPTH_FIRST, false, metaData);
@ -145,7 +145,7 @@ public class GlobalOrdinalsSignificantTermsAggregator extends GlobalOrdinalsStri
private final LongHash bucketOrds;
public WithHash(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource, BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggFactory, Map<String, Object> metaData) throws IOException {
public WithHash(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource, BucketCountThresholds bucketCountThresholds, IncludeExclude.OrdinalsFilter includeExclude, AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggFactory, Map<String, Object> metaData) throws IOException {
super(name, factories, valuesSource, bucketCountThresholds, includeExclude, aggregationContext, parent, termsAggFactory, metaData);
bucketOrds = new LongHash(1, aggregationContext.bigArrays());
}

View File

@ -47,7 +47,7 @@ public class SignificantStringTermsAggregator extends StringTermsAggregator {
public SignificantStringTermsAggregator(String name, AggregatorFactories factories, ValuesSource valuesSource,
BucketCountThresholds bucketCountThresholds,
IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent,
IncludeExclude.StringFilter includeExclude, AggregationContext aggregationContext, Aggregator parent,
SignificantTermsAggregatorFactory termsAggFactory, Map<String, Object> metaData) throws IOException {
super(name, factories, valuesSource, null, bucketCountThresholds, includeExclude, aggregationContext, parent, SubAggCollectionMode.DEPTH_FIRST, false, metaData);

View File

@ -65,7 +65,8 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource,
TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggregatorFactory, Map<String, Object> metaData) throws IOException {
return new SignificantStringTermsAggregator(name, factories, valuesSource, bucketCountThresholds, includeExclude, aggregationContext, parent, termsAggregatorFactory, metaData);
final IncludeExclude.StringFilter filter = includeExclude == null ? null : includeExclude.convertToStringFilter();
return new SignificantStringTermsAggregator(name, factories, valuesSource, bucketCountThresholds, filter, aggregationContext, parent, termsAggregatorFactory, metaData);
}
},
@ -77,7 +78,8 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggregatorFactory, Map<String, Object> metaData) throws IOException {
ValuesSource.Bytes.WithOrdinals valueSourceWithOrdinals = (ValuesSource.Bytes.WithOrdinals) valuesSource;
IndexSearcher indexSearcher = aggregationContext.searchContext().searcher();
return new GlobalOrdinalsSignificantTermsAggregator(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, bucketCountThresholds, includeExclude, aggregationContext, parent, termsAggregatorFactory, metaData);
final IncludeExclude.OrdinalsFilter filter = includeExclude == null ? null : includeExclude.convertToOrdinalsFilter();
return new GlobalOrdinalsSignificantTermsAggregator(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, bucketCountThresholds, filter, aggregationContext, parent, termsAggregatorFactory, metaData);
}
},
@ -87,7 +89,8 @@ public class SignificantTermsAggregatorFactory extends ValuesSourceAggregatorFac
Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource,
TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
AggregationContext aggregationContext, Aggregator parent, SignificantTermsAggregatorFactory termsAggregatorFactory, Map<String, Object> metaData) throws IOException {
return new GlobalOrdinalsSignificantTermsAggregator.WithHash(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, bucketCountThresholds, includeExclude, aggregationContext, parent, termsAggregatorFactory, metaData);
final IncludeExclude.OrdinalsFilter filter = includeExclude == null ? null : includeExclude.convertToOrdinalsFilter();
return new GlobalOrdinalsSignificantTermsAggregator.WithHash(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, bucketCountThresholds, filter, aggregationContext, parent, termsAggregatorFactory, metaData);
}
};

View File

@ -57,7 +57,7 @@ public class SignificantTermsParser implements Aggregator.Parser {
.scriptable(false)
.formattable(true)
.build();
IncludeExclude.Parser incExcParser = new IncludeExclude.Parser(aggregationName, SignificantStringTerms.TYPE, context);
IncludeExclude.Parser incExcParser = new IncludeExclude.Parser();
aggParser.parse(aggregationName, parser, context, vsParser, incExcParser);
TermsAggregator.BucketCountThresholds bucketCountThresholds = aggParser.getBucketCountThresholds();

View File

@ -57,7 +57,7 @@ import java.util.Map;
public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggregator {
protected final ValuesSource.Bytes.WithOrdinals.FieldData valuesSource;
protected final IncludeExclude includeExclude;
protected final IncludeExclude.OrdinalsFilter includeExclude;
// TODO: cache the acceptedglobalValues per aggregation definition.
// We can't cache this yet in ValuesSource, since ValuesSource is reused per field for aggs during the execution.
@ -71,7 +71,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
public GlobalOrdinalsStringTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource,
Terms.Order order, BucketCountThresholds bucketCountThresholds,
IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
IncludeExclude.OrdinalsFilter includeExclude, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
super(name, factories, aggregationContext, parent, order, bucketCountThresholds, collectionMode, showTermDocCountError, metaData);
this.valuesSource = valuesSource;
this.includeExclude = includeExclude;
@ -260,7 +260,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
private final LongHash bucketOrds;
public WithHash(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource,
Terms.Order order, BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude, AggregationContext aggregationContext,
Terms.Order order, BucketCountThresholds bucketCountThresholds, IncludeExclude.OrdinalsFilter includeExclude, AggregationContext aggregationContext,
Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
super(name, factories, valuesSource, order, bucketCountThresholds, includeExclude, aggregationContext, parent, collectionMode, showTermDocCountError, metaData);
bucketOrds = new LongHash(1, aggregationContext.bigArrays());

View File

@ -45,11 +45,11 @@ public class StringTermsAggregator extends AbstractStringTermsAggregator {
private final ValuesSource valuesSource;
protected final BytesRefHash bucketOrds;
private final IncludeExclude includeExclude;
private final IncludeExclude.StringFilter includeExclude;
public StringTermsAggregator(String name, AggregatorFactories factories, ValuesSource valuesSource,
Terms.Order order, BucketCountThresholds bucketCountThresholds,
IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
IncludeExclude.StringFilter includeExclude, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
super(name, factories, aggregationContext, parent, order, bucketCountThresholds, collectionMode, showTermDocCountError, metaData);
this.valuesSource = valuesSource;

View File

@ -50,7 +50,8 @@ public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory<Values
Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource,
Terms.Order order, TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode subAggCollectMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
return new StringTermsAggregator(name, factories, valuesSource, order, bucketCountThresholds, includeExclude, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData);
final IncludeExclude.StringFilter filter = includeExclude == null ? null : includeExclude.convertToStringFilter();
return new StringTermsAggregator(name, factories, valuesSource, order, bucketCountThresholds, filter, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData);
}
@Override
@ -65,7 +66,8 @@ public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory<Values
Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource,
Terms.Order order, TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode subAggCollectMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
return new GlobalOrdinalsStringTermsAggregator(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, order, bucketCountThresholds, includeExclude, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData);
final IncludeExclude.OrdinalsFilter filter = includeExclude == null ? null : includeExclude.convertToOrdinalsFilter();
return new GlobalOrdinalsStringTermsAggregator(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, order, bucketCountThresholds, filter, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData);
}
@Override
@ -80,7 +82,8 @@ public class TermsAggregatorFactory extends ValuesSourceAggregatorFactory<Values
Aggregator create(String name, AggregatorFactories factories, ValuesSource valuesSource,
Terms.Order order, TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude,
AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode subAggCollectMode, boolean showTermDocCountError, Map<String, Object> metaData) throws IOException {
return new GlobalOrdinalsStringTermsAggregator.WithHash(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, order, bucketCountThresholds, includeExclude, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData);
final IncludeExclude.OrdinalsFilter filter = includeExclude == null ? null : includeExclude.convertToOrdinalsFilter();
return new GlobalOrdinalsStringTermsAggregator.WithHash(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, order, bucketCountThresholds, filter, aggregationContext, parent, subAggCollectMode, showTermDocCountError, metaData);
}
@Override

View File

@ -19,6 +19,7 @@
package org.elasticsearch.search.aggregations.bucket.terms;
import org.apache.lucene.util.automaton.RegExp;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.search.aggregations.Aggregator.SubAggCollectionMode;
@ -37,9 +38,7 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder<TermsBuilder> {
private Terms.ValueType valueType;
private Terms.Order order;
private String includePattern;
private int includeFlags;
private String excludePattern;
private int excludeFlags;
private String executionHint;
private SubAggCollectionMode collectionMode;
private Boolean showTermDocCountError;
@ -88,26 +87,15 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder<TermsBuilder> {
/**
* Define a regular expression that will determine what terms should be aggregated. The regular expression is based
* on the {@link java.util.regex.Pattern} class.
* on the {@link RegExp} class.
*
* @see #include(String, int)
* @see {@link RegExp#RegExp(String)}
*/
public TermsBuilder include(String regex) {
return include(regex, 0);
}
/**
* Define a regular expression that will determine what terms should be aggregated. The regular expression is based
* on the {@link java.util.regex.Pattern} class.
*
* @see java.util.regex.Pattern#compile(String, int)
*/
public TermsBuilder include(String regex, int flags) {
if (includeTerms != null) {
throw new ElasticsearchIllegalArgumentException("exclude clause must be an array of strings or a regex, not both");
}
this.includePattern = regex;
this.includeFlags = flags;
return this;
}
@ -160,29 +148,18 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder<TermsBuilder> {
}
return termsAsString;
}
/**
* Define a regular expression that will filter out terms that should be excluded from the aggregation. The regular
* expression is based on the {@link java.util.regex.Pattern} class.
*
* @see #exclude(String, int)
*/
public TermsBuilder exclude(String regex) {
return exclude(regex, 0);
}
/**
* Define a regular expression that will filter out terms that should be excluded from the aggregation. The regular
* expression is based on the {@link java.util.regex.Pattern} class.
* expression is based on the {@link RegExp} class.
*
* @see java.util.regex.Pattern#compile(String, int)
* @see {@link RegExp#RegExp(String)}
*/
public TermsBuilder exclude(String regex, int flags) {
public TermsBuilder exclude(String regex) {
if (excludeTerms != null) {
throw new ElasticsearchIllegalArgumentException("exclude clause must be an array of exact values or a regex, not both");
}
this.excludePattern = regex;
this.excludeFlags = flags;
return this;
}
@ -287,27 +264,13 @@ public class TermsBuilder extends ValuesSourceAggregationBuilder<TermsBuilder> {
builder.array("include", includeTerms);
}
if (includePattern != null) {
if (includeFlags == 0) {
builder.field("include", includePattern);
} else {
builder.startObject("include")
.field("pattern", includePattern)
.field("flags", includeFlags)
.endObject();
}
builder.field("include", includePattern);
}
if (excludeTerms != null) {
builder.array("exclude", excludeTerms);
}
if (excludePattern != null) {
if (excludeFlags == 0) {
builder.field("exclude", excludePattern);
} else {
builder.startObject("exclude")
.field("pattern", excludePattern)
.field("flags", excludeFlags)
.endObject();
}
builder.field("exclude", excludePattern);
}
return builder;
}

View File

@ -46,7 +46,7 @@ public class TermsParser implements Aggregator.Parser {
public AggregatorFactory parse(String aggregationName, XContentParser parser, SearchContext context) throws IOException {
TermsParametersParser aggParser = new TermsParametersParser();
ValuesSourceParser vsParser = ValuesSourceParser.any(aggregationName, StringTerms.TYPE, context).scriptable(true).formattable(true).build();
IncludeExclude.Parser incExcParser = new IncludeExclude.Parser(aggregationName, StringTerms.TYPE, context);
IncludeExclude.Parser incExcParser = new IncludeExclude.Parser();
aggParser.parse(aggregationName, parser, context, vsParser, incExcParser);
List<OrderElement> orderElements = aggParser.getOrderElements();

View File

@ -20,22 +20,30 @@ package org.elasticsearch.search.aggregations.bucket.terms.support;
import com.carrotsearch.hppc.LongOpenHashSet;
import com.carrotsearch.hppc.LongSet;
import org.apache.lucene.index.RandomAccessOrds;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.*;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LongBitSet;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.search.aggregations.InternalAggregation;
import org.elasticsearch.search.aggregations.support.ValuesSource;
import org.elasticsearch.search.internal.SearchContext;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.SortedSet;
import java.util.TreeSet;
/**
* Defines the include/exclude regular expression filtering for string terms aggregation. In this filtering logic,
@ -43,8 +51,8 @@ import java.util.regex.Pattern;
*/
public class IncludeExclude {
// The includeValue and excludeValue ByteRefs which are the result of the parsing
// process are converted into a LongFilter when used on numeric fields
// The includeValue and excludeValue ByteRefs which are the result of the parsing
// process are converted into a LongFilter when used on numeric fields
// in the index.
public static class LongFilter {
private LongSet valids;
@ -72,152 +80,145 @@ public class IncludeExclude {
}
}
private final Matcher include;
private final Matcher exclude;
private final CharsRefBuilder scratch = new CharsRefBuilder();
private Set<BytesRef> includeValues;
private Set<BytesRef> excludeValues;
private final boolean hasRegexTest;
// Only used for the 'map' execution mode (ie. scripts)
public static class StringFilter {
private final ByteRunAutomaton runAutomaton;
private StringFilter(Automaton automaton) {
this.runAutomaton = new ByteRunAutomaton(automaton);
}
/**
* Returns whether the given value is accepted based on the {@code include} & {@code exclude} patterns.
*/
public boolean accept(BytesRef value) {
return runAutomaton.run(value.bytes, value.offset, value.length);
}
}
public static class OrdinalsFilter {
private final CompiledAutomaton compiled;
private OrdinalsFilter(Automaton automaton) {
this.compiled = new CompiledAutomaton(automaton);
}
/**
* Computes which global ordinals are accepted by this IncludeExclude instance.
*/
public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals, ValuesSource.Bytes.WithOrdinals valueSource) throws IOException {
LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
TermsEnum globalTermsEnum;
Terms globalTerms = new DocValuesTerms(globalOrdinals);
// TODO: specialize based on compiled.type: for ALL and prefixes (sinkState >= 0 ) we can avoid i/o and just set bits.
globalTermsEnum = compiled.getTermsEnum(globalTerms);
for (BytesRef term = globalTermsEnum.next(); term != null; term = globalTermsEnum.next()) {
acceptedGlobalOrdinals.set(globalTermsEnum.ord());
}
return acceptedGlobalOrdinals;
}
}
private final RegExp include, exclude;
private final SortedSet<BytesRef> includeValues, excludeValues;
/**
* @param include The regular expression pattern for the terms to be included
* (may only be {@code null} if one of the other arguments is none-null.
* @param includeValues The terms to be included
* (may only be {@code null} if one of the other arguments is none-null.
* @param exclude The regular expression pattern for the terms to be excluded
* (may only be {@code null} if one of the other arguments is none-null.
* @param excludeValues The terms to be excluded
* (may only be {@code null} if one of the other arguments is none-null.
*/
public IncludeExclude(Pattern include, Pattern exclude, Set<BytesRef> includeValues, Set<BytesRef> excludeValues) {
assert includeValues != null || include != null ||
exclude != null || excludeValues != null : "includes & excludes cannot both be null"; // otherwise IncludeExclude object should be null
this.include = include != null ? include.matcher("") : null;
this.exclude = exclude != null ? exclude.matcher("") : null;
hasRegexTest = include != null || exclude != null;
public IncludeExclude(RegExp include, RegExp exclude) {
if (include == null && exclude == null) {
throw new IllegalArgumentException();
}
this.include = include;
this.exclude = exclude;
this.includeValues = null;
this.excludeValues = null;
}
/**
* @param includeValues The terms to be included
* @param excludeValues The terms to be excluded
*/
public IncludeExclude(SortedSet<BytesRef> includeValues, SortedSet<BytesRef> excludeValues) {
if (includeValues == null && excludeValues == null) {
throw new IllegalArgumentException();
}
this.include = null;
this.exclude = null;
this.includeValues = includeValues;
this.excludeValues = excludeValues;
}
/**
* Returns whether the given value is accepted based on the {@code include} & {@code exclude} patterns.
* Terms adapter around doc values.
*/
public boolean accept(BytesRef value) {
private static class DocValuesTerms extends Terms {
if (hasRegexTest) {
// We need to perform UTF8 to UTF16 conversion for use in the regex matching
scratch.copyUTF8Bytes(value);
}
return isIncluded(value, scratch.get()) && !isExcluded(value, scratch.get());
}
private boolean isIncluded(BytesRef value, CharsRef utf16Chars) {
private final SortedSetDocValues values;
if ((includeValues == null) && (include == null)) {
// No include criteria to be tested.
return true;
DocValuesTerms(SortedSetDocValues values) {
this.values = values;
}
if (include != null) {
if (include.reset(scratch.get()).matches()) {
return true;
}
@Override
public TermsEnum iterator(TermsEnum reuse) throws IOException {
return values.termsEnum();
}
if (includeValues != null) {
if (includeValues.contains(value)) {
return true;
}
@Override
public long size() throws IOException {
return -1;
}
// Some include criteria was tested but no match found
return false;
}
private boolean isExcluded(BytesRef value, CharsRef utf16Chars) {
if (exclude != null) {
if (exclude.reset(scratch.get()).matches()) {
return true;
}
@Override
public long getSumTotalTermFreq() throws IOException {
return -1;
}
if (excludeValues != null) {
if (excludeValues.contains(value)) {
return true;
}
@Override
public long getSumDocFreq() throws IOException {
return -1;
}
// No exclude criteria was tested or no match found
return false;
@Override
public int getDocCount() throws IOException {
return -1;
}
@Override
public boolean hasFreqs() {
return false;
}
@Override
public boolean hasOffsets() {
return false;
}
@Override
public boolean hasPositions() {
return false;
}
@Override
public boolean hasPayloads() {
return false;
}
}
/**
* Computes which global ordinals are accepted by this IncludeExclude instance.
*/
public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals, ValuesSource.Bytes.WithOrdinals valueSource) {
LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
// There are 3 ways of populating this bitset:
// 1) Looking up the global ordinals for known "include" terms
// 2) Looking up the global ordinals for known "exclude" terms
// 3) Traversing the term enum for all terms and running past regexes
// Option 3 is known to be very slow in the case of high-cardinality fields and
// should be avoided if possible.
if (includeValues != null) {
// optimize for the case where the set of accepted values is a set
// of known terms, not a regex that would have to be tested against all terms in the index
for (BytesRef includeValue : includeValues) {
// We need to perform UTF8 to UTF16 conversion for use in the regex matching
scratch.copyUTF8Bytes(includeValue);
if (!isExcluded(includeValue, scratch.get())) {
long ord = globalOrdinals.lookupTerm(includeValue);
if (ord >= 0) {
acceptedGlobalOrdinals.set(ord);
}
}
}
} else {
if(hasRegexTest) {
// We have includeVals that are a regex or only regex excludes - we need to do the potentially
// slow option of hitting termsEnum for every term in the index.
TermsEnum globalTermsEnum = globalOrdinals.termsEnum();
try {
for (BytesRef term = globalTermsEnum.next(); term != null; term = globalTermsEnum.next()) {
if (accept(term)) {
acceptedGlobalOrdinals.set(globalTermsEnum.ord());
}
}
} catch (IOException e) {
throw ExceptionsHelper.convertToElastic(e);
}
} else {
// we only have a set of known values to exclude - create a bitset with all good values and negate the known bads
acceptedGlobalOrdinals.set(0, acceptedGlobalOrdinals.length());
for (BytesRef excludeValue : excludeValues) {
long ord = globalOrdinals.lookupTerm(excludeValue);
if (ord >= 0) {
acceptedGlobalOrdinals.clear(ord);
}
}
}
}
return acceptedGlobalOrdinals;
}
public static class Parser {
private final String aggName;
private final InternalAggregation.Type aggType;
private final SearchContext context;
String include = null;
int includeFlags = 0; // 0 means no flags
String exclude = null;
int excludeFlags = 0; // 0 means no flags
Set<BytesRef> includeValues;
Set<BytesRef> excludeValues;
public Parser(String aggName, InternalAggregation.Type aggType, SearchContext context) {
this.aggName = aggName;
this.aggType = aggType;
this.context = context;
}
SortedSet<BytesRef> includeValues;
SortedSet<BytesRef> excludeValues;
public boolean token(String currentFieldName, XContentParser.Token token, XContentParser parser) throws IOException {
@ -231,14 +232,14 @@ public class IncludeExclude {
}
return true;
}
if (token == XContentParser.Token.START_ARRAY) {
if ("include".equals(currentFieldName)) {
includeValues = parseArrayToSet(parser);
includeValues = new TreeSet<>(parseArrayToSet(parser));
return true;
}
}
if ("exclude".equals(currentFieldName)) {
excludeValues = parseArrayToSet(parser);
excludeValues = new TreeSet<>(parseArrayToSet(parser));
return true;
}
return false;
@ -252,12 +253,6 @@ public class IncludeExclude {
} else if (token == XContentParser.Token.VALUE_STRING) {
if ("pattern".equals(currentFieldName)) {
include = parser.text();
} else if ("flags".equals(currentFieldName)) {
includeFlags = Regex.flagsFromString(parser.text());
}
} else if (token == XContentParser.Token.VALUE_NUMBER) {
if ("flags".equals(currentFieldName)) {
includeFlags = parser.intValue();
}
}
}
@ -268,12 +263,6 @@ public class IncludeExclude {
} else if (token == XContentParser.Token.VALUE_STRING) {
if ("pattern".equals(currentFieldName)) {
exclude = parser.text();
} else if ("flags".equals(currentFieldName)) {
excludeFlags = Regex.flagsFromString(parser.text());
}
} else if (token == XContentParser.Token.VALUE_NUMBER) {
if ("flags".equals(currentFieldName)) {
excludeFlags = parser.intValue();
}
}
}
@ -298,19 +287,50 @@ public class IncludeExclude {
}
return set;
}
public IncludeExclude includeExclude() {
if (include == null && exclude == null && includeValues == null && excludeValues == null) {
RegExp includePattern = include != null ? new RegExp(include) : null;
RegExp excludePattern = exclude != null ? new RegExp(exclude) : null;
if (includePattern != null || excludePattern != null) {
if (includeValues != null || excludeValues != null) {
throw new ElasticsearchIllegalArgumentException("Can only use regular expression include/exclude or a set of values, not both");
}
return new IncludeExclude(includePattern, excludePattern);
} else if (includeValues != null || excludeValues != null) {
return new IncludeExclude(includeValues, excludeValues);
} else {
return null;
}
Pattern includePattern = include != null ? Pattern.compile(include, includeFlags) : null;
Pattern excludePattern = exclude != null ? Pattern.compile(exclude, excludeFlags) : null;
return new IncludeExclude(includePattern, excludePattern, includeValues, excludeValues);
}
}
public boolean isRegexBased() {
return hasRegexTest;
return include != null || exclude != null;
}
private Automaton toAutomaton() {
Automaton a = null;
if (include != null) {
a = include.toAutomaton();
} else if (includeValues != null) {
a = Automata.makeStringUnion(includeValues);
} else {
a = Automata.makeAnyString();
}
if (exclude != null) {
a = Operations.minus(a, exclude.toAutomaton(), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
} else if (excludeValues != null) {
a = Operations.minus(a, Automata.makeStringUnion(excludeValues), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
}
return a;
}
public StringFilter convertToStringFilter() {
return new StringFilter(toAutomaton());
}
public OrdinalsFilter convertToOrdinalsFilter() {
return new OrdinalsFilter(toAutomaton());
}
public LongFilter convertToLongFilter() {
@ -329,6 +349,7 @@ public class IncludeExclude {
}
return result;
}
public LongFilter convertToDoubleFilter() {
int numValids = includeValues == null ? 0 : includeValues.size();
int numInvalids = excludeValues == null ? 0 : excludeValues.size();

View File

@ -0,0 +1,130 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.benchmark.search.aggregations;
import org.apache.lucene.util.TestUtil;
import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.StopWatch;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.node.Node;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import static org.elasticsearch.client.Requests.createIndexRequest;
import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_REPLICAS;
import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_SHARDS;
import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.elasticsearch.node.NodeBuilder.nodeBuilder;
import static org.elasticsearch.search.aggregations.AggregationBuilders.terms;
public class IncludeExcludeAggregationSearchBenchmark {
private static final Random R = new Random();
private static final String CLUSTER_NAME = IncludeExcludeAggregationSearchBenchmark.class.getSimpleName();
private static final int NUM_DOCS = 10000000;
private static final int BATCH = 100;
private static final int WARM = 3;
private static final int RUNS = 10;
private static final int ITERS = 3;
public static void main(String[] args) {
Settings settings = settingsBuilder()
.put("index.refresh_interval", "-1")
.put(SETTING_NUMBER_OF_SHARDS, 1)
.put(SETTING_NUMBER_OF_REPLICAS, 0)
.build();
Node[] nodes = new Node[1];
for (int i = 0; i < nodes.length; i++) {
nodes[i] = nodeBuilder().clusterName(CLUSTER_NAME)
.settings(settingsBuilder().put(settings).put("name", "node" + i))
.node();
}
Node clientNode = nodeBuilder()
.clusterName(CLUSTER_NAME)
.settings(settingsBuilder().put(settings).put("name", "client")).client(true).node();
Client client = clientNode.client();
try {
client.admin().indices().create(createIndexRequest("index").settings(settings).mapping("type",
jsonBuilder().startObject().startObject("type").startObject("properties")
.startObject("str")
.field("type", "string")
.field("index", "not_analyzed")
.endObject()
.endObject().endObject().endObject())).actionGet();
System.out.println("Indexing " + NUM_DOCS + " documents");
StopWatch stopWatch = new StopWatch().start();
for (int i = 0; i < NUM_DOCS; ) {
BulkRequestBuilder request = client.prepareBulk();
for (int j = 0; j < BATCH && i < NUM_DOCS; ++j) {
request.add(client.prepareIndex("index", "type", Integer.toString(i)).setSource("str", TestUtil.randomSimpleString(R)));
++i;
}
BulkResponse response = request.execute().actionGet();
if (response.hasFailures()) {
System.err.println("--> failures...");
System.err.println(response.buildFailureMessage());
}
if ((i % 100000) == 0) {
System.out.println("--> Indexed " + i + " took " + stopWatch.stop().lastTaskTime());
stopWatch.start();
}
}
client.admin().indices().prepareRefresh("index").execute().actionGet();
} catch (Exception e) {
System.out.println("Index already exists, skipping index creation");
}
ClusterHealthResponse clusterHealthResponse = client.admin().cluster().prepareHealth().setWaitForGreenStatus().setTimeout("10m").execute().actionGet();
if (clusterHealthResponse.isTimedOut()) {
System.err.println("--> Timed out waiting for cluster health");
}
for (int i = 0; i < WARM + RUNS; ++i) {
if (i >= WARM) {
System.out.println("RUN " + (i - WARM));
}
long start = System.nanoTime();
SearchResponse resp = null;
for (int j = 0; j < ITERS; ++j) {
resp = client.prepareSearch("index").setQuery(QueryBuilders.prefixQuery("str", "sf")).setSize(0).addAggregation(terms("t").field("str").include("s.*")).execute().actionGet();
}
long end = System.nanoTime();
if (i >= WARM) {
System.out.println(new TimeValue((end - start) / ITERS, TimeUnit.NANOSECONDS));
}
}
}
}

View File

@ -387,86 +387,6 @@ public class StringTermsTests extends AbstractTermsTests {
}
}
@Test
public void singleValueField_WithRegexFiltering_WithFlags() throws Exception {
// include without exclude
// we should be left with: val000, val001, val002, val003, val004, val005, val006, val007, val008, val009
// with case insensitive flag on the include regex
SearchResponse response = client().prepareSearch("idx").setTypes("high_card_type")
.addAggregation(terms("terms")
.executionHint(randomExecutionHint())
.field(SINGLE_VALUED_FIELD_NAME)
.collectMode(randomFrom(SubAggCollectionMode.values())).include("VAL00.+", Pattern.CASE_INSENSITIVE))
.execute().actionGet();
assertSearchResponse(response);
Terms terms = response.getAggregations().get("terms");
assertThat(terms, notNullValue());
assertThat(terms.getName(), equalTo("terms"));
assertThat(terms.getBuckets().size(), equalTo(10));
for (int i = 0; i < 10; i++) {
Terms.Bucket bucket = terms.getBucketByKey("val00" + i);
assertThat(bucket, notNullValue());
assertThat(key(bucket), equalTo("val00" + i));
assertThat(bucket.getDocCount(), equalTo(1l));
}
// include and exclude
// we should be left with: val002, val003, val004, val005, val006, val007, val008, val009
// with multi-flag masking on the exclude regex
response = client().prepareSearch("idx").setTypes("high_card_type")
.addAggregation(terms("terms")
.executionHint(randomExecutionHint())
.field(SINGLE_VALUED_FIELD_NAME)
.collectMode(randomFrom(SubAggCollectionMode.values())).include("val00.+").exclude("( val000 | VAL001 )#this is a comment", Pattern.CASE_INSENSITIVE | Pattern.COMMENTS))
.execute().actionGet();
assertSearchResponse(response);
terms = response.getAggregations().get("terms");
assertThat(terms, notNullValue());
assertThat(terms.getName(), equalTo("terms"));
assertThat(terms.getBuckets().size(), equalTo(8));
for (int i = 2; i < 10; i++) {
Terms.Bucket bucket = terms.getBucketByKey("val00" + i);
assertThat(bucket, notNullValue());
assertThat(key(bucket), equalTo("val00" + i));
assertThat(bucket.getDocCount(), equalTo(1l));
}
// exclude without include
// we should be left with: val000, val001, val002, val003, val004, val005, val006, val007, val008, val009
// with a "no flag" flag
response = client().prepareSearch("idx").setTypes("high_card_type")
.addAggregation(terms("terms")
.executionHint(randomExecutionHint())
.field(SINGLE_VALUED_FIELD_NAME)
.collectMode(randomFrom(SubAggCollectionMode.values())).exclude("val0[1-9]+.+", 0))
.execute().actionGet();
assertSearchResponse(response);
terms = response.getAggregations().get("terms");
assertThat(terms, notNullValue());
assertThat(terms.getName(), equalTo("terms"));
assertThat(terms.getBuckets().size(), equalTo(10));
for (int i = 0; i < 10; i++) {
Terms.Bucket bucket = terms.getBucketByKey("val00" + i);
assertThat(bucket, notNullValue());
assertThat(key(bucket), equalTo("val00" + i));
assertThat(bucket.getDocCount(), equalTo(1l));
}
}
@Test
public void singleValueField_WithExactTermFiltering() throws Exception {
// include without exclude