Aggregations Refactor: Refactor Sampler Aggregation
This commit is contained in:
parent
8f63c46d27
commit
1aea0faa86
|
@ -55,6 +55,7 @@ import org.elasticsearch.search.aggregations.bucket.range.geodistance.GeoDistanc
|
||||||
import org.elasticsearch.search.aggregations.bucket.range.geodistance.InternalGeoDistance;
|
import org.elasticsearch.search.aggregations.bucket.range.geodistance.InternalGeoDistance;
|
||||||
import org.elasticsearch.search.aggregations.bucket.range.ipv4.InternalIPv4Range;
|
import org.elasticsearch.search.aggregations.bucket.range.ipv4.InternalIPv4Range;
|
||||||
import org.elasticsearch.search.aggregations.bucket.range.ipv4.IpRangeParser;
|
import org.elasticsearch.search.aggregations.bucket.range.ipv4.IpRangeParser;
|
||||||
|
import org.elasticsearch.search.aggregations.bucket.sampler.DiversifiedSamplerParser;
|
||||||
import org.elasticsearch.search.aggregations.bucket.sampler.InternalSampler;
|
import org.elasticsearch.search.aggregations.bucket.sampler.InternalSampler;
|
||||||
import org.elasticsearch.search.aggregations.bucket.sampler.SamplerParser;
|
import org.elasticsearch.search.aggregations.bucket.sampler.SamplerParser;
|
||||||
import org.elasticsearch.search.aggregations.bucket.sampler.UnmappedSampler;
|
import org.elasticsearch.search.aggregations.bucket.sampler.UnmappedSampler;
|
||||||
|
@ -264,6 +265,7 @@ public class SearchModule extends AbstractModule {
|
||||||
multibinderAggParser.addBinding().to(FilterParser.class);
|
multibinderAggParser.addBinding().to(FilterParser.class);
|
||||||
multibinderAggParser.addBinding().to(FiltersParser.class);
|
multibinderAggParser.addBinding().to(FiltersParser.class);
|
||||||
multibinderAggParser.addBinding().to(SamplerParser.class);
|
multibinderAggParser.addBinding().to(SamplerParser.class);
|
||||||
|
multibinderAggParser.addBinding().to(DiversifiedSamplerParser.class);
|
||||||
multibinderAggParser.addBinding().to(TermsParser.class);
|
multibinderAggParser.addBinding().to(TermsParser.class);
|
||||||
multibinderAggParser.addBinding().to(SignificantTermsParser.class);
|
multibinderAggParser.addBinding().to(SignificantTermsParser.class);
|
||||||
multibinderAggParser.addBinding().to(RangeParser.class);
|
multibinderAggParser.addBinding().to(RangeParser.class);
|
||||||
|
|
|
@ -0,0 +1,79 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.search.aggregations.bucket.sampler;
|
||||||
|
|
||||||
|
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||||
|
import org.elasticsearch.search.aggregations.ValuesSourceAggregationBuilder;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builder for the {@link Sampler} aggregation.
|
||||||
|
*/
|
||||||
|
public class DiversifiedSamplerAggregationBuilder extends ValuesSourceAggregationBuilder<DiversifiedSamplerAggregationBuilder> {
|
||||||
|
|
||||||
|
private int shardSize = SamplerAggregator.Factory.DEFAULT_SHARD_SAMPLE_SIZE;
|
||||||
|
|
||||||
|
int maxDocsPerValue = SamplerAggregator.DiversifiedFactory.MAX_DOCS_PER_VALUE_DEFAULT;
|
||||||
|
String executionHint = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sole constructor.
|
||||||
|
*/
|
||||||
|
public DiversifiedSamplerAggregationBuilder(String name) {
|
||||||
|
super(name, SamplerAggregator.DiversifiedFactory.TYPE.name());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the max num docs to be returned from each shard.
|
||||||
|
*/
|
||||||
|
public DiversifiedSamplerAggregationBuilder shardSize(int shardSize) {
|
||||||
|
this.shardSize = shardSize;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public DiversifiedSamplerAggregationBuilder maxDocsPerValue(int maxDocsPerValue) {
|
||||||
|
this.maxDocsPerValue = maxDocsPerValue;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public DiversifiedSamplerAggregationBuilder executionHint(String executionHint) {
|
||||||
|
this.executionHint = executionHint;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected XContentBuilder doInternalXContent(XContentBuilder builder, Params params) throws IOException {
|
||||||
|
if (shardSize != SamplerAggregator.Factory.DEFAULT_SHARD_SAMPLE_SIZE) {
|
||||||
|
builder.field(SamplerAggregator.SHARD_SIZE_FIELD.getPreferredName(), shardSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (maxDocsPerValue != SamplerAggregator.DiversifiedFactory.MAX_DOCS_PER_VALUE_DEFAULT) {
|
||||||
|
builder.field(SamplerAggregator.MAX_DOCS_PER_VALUE_FIELD.getPreferredName(), maxDocsPerValue);
|
||||||
|
}
|
||||||
|
if (executionHint != null) {
|
||||||
|
builder.field(SamplerAggregator.EXECUTION_HINT_FIELD.getPreferredName(), executionHint);
|
||||||
|
}
|
||||||
|
|
||||||
|
return builder;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,97 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
package org.elasticsearch.search.aggregations.bucket.sampler;
|
||||||
|
|
||||||
|
|
||||||
|
import org.elasticsearch.common.ParseField;
|
||||||
|
import org.elasticsearch.common.ParseFieldMatcher;
|
||||||
|
import org.elasticsearch.common.xcontent.XContentParser;
|
||||||
|
import org.elasticsearch.search.aggregations.AggregatorFactory;
|
||||||
|
import org.elasticsearch.search.aggregations.support.AbstractValuesSourceParser.AnyValuesSourceParser;
|
||||||
|
import org.elasticsearch.search.aggregations.support.ValueType;
|
||||||
|
import org.elasticsearch.search.aggregations.support.ValuesSource;
|
||||||
|
import org.elasticsearch.search.aggregations.support.ValuesSourceAggregatorFactory;
|
||||||
|
import org.elasticsearch.search.aggregations.support.ValuesSourceType;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class DiversifiedSamplerParser extends AnyValuesSourceParser {
|
||||||
|
|
||||||
|
public DiversifiedSamplerParser() {
|
||||||
|
super(true, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String type() {
|
||||||
|
return SamplerAggregator.DiversifiedFactory.TYPE.name();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected ValuesSourceAggregatorFactory<ValuesSource> createFactory(String aggregationName, ValuesSourceType valuesSourceType,
|
||||||
|
ValueType targetValueType, Map<ParseField, Object> otherOptions) {
|
||||||
|
SamplerAggregator.DiversifiedFactory factory = new SamplerAggregator.DiversifiedFactory(aggregationName, valuesSourceType,
|
||||||
|
targetValueType);
|
||||||
|
Integer shardSize = (Integer) otherOptions.get(SamplerAggregator.SHARD_SIZE_FIELD);
|
||||||
|
if (shardSize != null) {
|
||||||
|
factory.shardSize(shardSize);
|
||||||
|
}
|
||||||
|
Integer maxDocsPerValue = (Integer) otherOptions.get(SamplerAggregator.MAX_DOCS_PER_VALUE_FIELD);
|
||||||
|
if (maxDocsPerValue != null) {
|
||||||
|
factory.maxDocsPerValue(maxDocsPerValue);
|
||||||
|
}
|
||||||
|
String executionHint = (String) otherOptions.get(SamplerAggregator.EXECUTION_HINT_FIELD);
|
||||||
|
if (executionHint != null) {
|
||||||
|
factory.executionHint(executionHint);
|
||||||
|
}
|
||||||
|
return factory;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected boolean token(String aggregationName, String currentFieldName, XContentParser.Token token, XContentParser parser,
|
||||||
|
ParseFieldMatcher parseFieldMatcher, Map<ParseField, Object> otherOptions) throws IOException {
|
||||||
|
if (token == XContentParser.Token.VALUE_NUMBER) {
|
||||||
|
if (parseFieldMatcher.match(currentFieldName, SamplerAggregator.SHARD_SIZE_FIELD)) {
|
||||||
|
int shardSize = parser.intValue();
|
||||||
|
otherOptions.put(SamplerAggregator.SHARD_SIZE_FIELD, shardSize);
|
||||||
|
return true;
|
||||||
|
} else if (parseFieldMatcher.match(currentFieldName, SamplerAggregator.MAX_DOCS_PER_VALUE_FIELD)) {
|
||||||
|
int maxDocsPerValue = parser.intValue();
|
||||||
|
otherOptions.put(SamplerAggregator.MAX_DOCS_PER_VALUE_FIELD, maxDocsPerValue);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
} else if (token == XContentParser.Token.VALUE_STRING) {
|
||||||
|
if (parseFieldMatcher.match(currentFieldName, SamplerAggregator.EXECUTION_HINT_FIELD)) {
|
||||||
|
String executionHint = parser.text();
|
||||||
|
otherOptions.put(SamplerAggregator.EXECUTION_HINT_FIELD, executionHint);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public AggregatorFactory[] getFactoryPrototypes() {
|
||||||
|
return new AggregatorFactory[] { new SamplerAggregator.DiversifiedFactory(null, null, null) };
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -29,10 +29,7 @@ import java.io.IOException;
|
||||||
*/
|
*/
|
||||||
public class SamplerAggregationBuilder extends ValuesSourceAggregationBuilder<SamplerAggregationBuilder> {
|
public class SamplerAggregationBuilder extends ValuesSourceAggregationBuilder<SamplerAggregationBuilder> {
|
||||||
|
|
||||||
private int shardSize = SamplerParser.DEFAULT_SHARD_SAMPLE_SIZE;
|
private int shardSize = SamplerAggregator.Factory.DEFAULT_SHARD_SAMPLE_SIZE;
|
||||||
|
|
||||||
int maxDocsPerValue = SamplerParser.MAX_DOCS_PER_VALUE_DEFAULT;
|
|
||||||
String executionHint = null;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Sole constructor.
|
* Sole constructor.
|
||||||
|
@ -49,28 +46,10 @@ public class SamplerAggregationBuilder extends ValuesSourceAggregationBuilder<Sa
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
public SamplerAggregationBuilder maxDocsPerValue(int maxDocsPerValue) {
|
|
||||||
this.maxDocsPerValue = maxDocsPerValue;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public SamplerAggregationBuilder executionHint(String executionHint) {
|
|
||||||
this.executionHint = executionHint;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected XContentBuilder doInternalXContent(XContentBuilder builder, Params params) throws IOException {
|
protected XContentBuilder doInternalXContent(XContentBuilder builder, Params params) throws IOException {
|
||||||
// builder.startObject();
|
if (shardSize != SamplerAggregator.Factory.DEFAULT_SHARD_SAMPLE_SIZE) {
|
||||||
if (shardSize != SamplerParser.DEFAULT_SHARD_SAMPLE_SIZE) {
|
builder.field(SamplerAggregator.SHARD_SIZE_FIELD.getPreferredName(), shardSize);
|
||||||
builder.field(SamplerParser.SHARD_SIZE_FIELD.getPreferredName(), shardSize);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (maxDocsPerValue != SamplerParser.MAX_DOCS_PER_VALUE_DEFAULT) {
|
|
||||||
builder.field(SamplerParser.MAX_DOCS_PER_VALUE_FIELD.getPreferredName(), maxDocsPerValue);
|
|
||||||
}
|
|
||||||
if (executionHint != null) {
|
|
||||||
builder.field(SamplerParser.EXECUTION_HINT_FIELD.getPreferredName(), executionHint);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return builder;
|
return builder;
|
||||||
|
|
|
@ -21,12 +21,16 @@ package org.elasticsearch.search.aggregations.bucket.sampler;
|
||||||
import org.apache.lucene.index.LeafReaderContext;
|
import org.apache.lucene.index.LeafReaderContext;
|
||||||
import org.elasticsearch.common.ParseField;
|
import org.elasticsearch.common.ParseField;
|
||||||
import org.elasticsearch.common.ParseFieldMatcher;
|
import org.elasticsearch.common.ParseFieldMatcher;
|
||||||
|
import org.elasticsearch.common.io.stream.StreamInput;
|
||||||
|
import org.elasticsearch.common.io.stream.StreamOutput;
|
||||||
import org.elasticsearch.common.lease.Releasables;
|
import org.elasticsearch.common.lease.Releasables;
|
||||||
|
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||||
import org.elasticsearch.search.aggregations.AggregationExecutionException;
|
import org.elasticsearch.search.aggregations.AggregationExecutionException;
|
||||||
import org.elasticsearch.search.aggregations.Aggregator;
|
import org.elasticsearch.search.aggregations.Aggregator;
|
||||||
import org.elasticsearch.search.aggregations.AggregatorFactories;
|
import org.elasticsearch.search.aggregations.AggregatorFactories;
|
||||||
import org.elasticsearch.search.aggregations.AggregatorFactory;
|
import org.elasticsearch.search.aggregations.AggregatorFactory;
|
||||||
import org.elasticsearch.search.aggregations.InternalAggregation;
|
import org.elasticsearch.search.aggregations.InternalAggregation;
|
||||||
|
import org.elasticsearch.search.aggregations.InternalAggregation.Type;
|
||||||
import org.elasticsearch.search.aggregations.LeafBucketCollector;
|
import org.elasticsearch.search.aggregations.LeafBucketCollector;
|
||||||
import org.elasticsearch.search.aggregations.NonCollectingAggregator;
|
import org.elasticsearch.search.aggregations.NonCollectingAggregator;
|
||||||
import org.elasticsearch.search.aggregations.bucket.BestDocsDeferringCollector;
|
import org.elasticsearch.search.aggregations.bucket.BestDocsDeferringCollector;
|
||||||
|
@ -34,14 +38,16 @@ import org.elasticsearch.search.aggregations.bucket.DeferringBucketCollector;
|
||||||
import org.elasticsearch.search.aggregations.bucket.SingleBucketAggregator;
|
import org.elasticsearch.search.aggregations.bucket.SingleBucketAggregator;
|
||||||
import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator;
|
import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator;
|
||||||
import org.elasticsearch.search.aggregations.support.AggregationContext;
|
import org.elasticsearch.search.aggregations.support.AggregationContext;
|
||||||
|
import org.elasticsearch.search.aggregations.support.ValueType;
|
||||||
import org.elasticsearch.search.aggregations.support.ValuesSource;
|
import org.elasticsearch.search.aggregations.support.ValuesSource;
|
||||||
import org.elasticsearch.search.aggregations.support.ValuesSource.Numeric;
|
import org.elasticsearch.search.aggregations.support.ValuesSource.Numeric;
|
||||||
import org.elasticsearch.search.aggregations.support.ValuesSourceAggregatorFactory;
|
import org.elasticsearch.search.aggregations.support.ValuesSourceAggregatorFactory;
|
||||||
import org.elasticsearch.search.aggregations.support.ValuesSourceParser;
|
import org.elasticsearch.search.aggregations.support.ValuesSourceType;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Aggregate on only the top-scoring docs on a shard.
|
* Aggregate on only the top-scoring docs on a shard.
|
||||||
|
@ -55,6 +61,10 @@ import java.util.Map;
|
||||||
*/
|
*/
|
||||||
public class SamplerAggregator extends SingleBucketAggregator {
|
public class SamplerAggregator extends SingleBucketAggregator {
|
||||||
|
|
||||||
|
public static final ParseField SHARD_SIZE_FIELD = new ParseField("shard_size");
|
||||||
|
public static final ParseField MAX_DOCS_PER_VALUE_FIELD = new ParseField("max_docs_per_value");
|
||||||
|
public static final ParseField EXECUTION_HINT_FIELD = new ParseField("execution_hint");
|
||||||
|
|
||||||
|
|
||||||
public enum ExecutionMode {
|
public enum ExecutionMode {
|
||||||
|
|
||||||
|
@ -182,34 +192,123 @@ public class SamplerAggregator extends SingleBucketAggregator {
|
||||||
|
|
||||||
public static class Factory extends AggregatorFactory {
|
public static class Factory extends AggregatorFactory {
|
||||||
|
|
||||||
private int shardSize;
|
public static final int DEFAULT_SHARD_SAMPLE_SIZE = 100;
|
||||||
|
|
||||||
public Factory(String name, int shardSize) {
|
private int shardSize = DEFAULT_SHARD_SAMPLE_SIZE;
|
||||||
|
|
||||||
|
public Factory(String name) {
|
||||||
super(name, InternalSampler.TYPE);
|
super(name, InternalSampler.TYPE);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the max num docs to be returned from each shard.
|
||||||
|
*/
|
||||||
|
public void shardSize(int shardSize) {
|
||||||
this.shardSize = shardSize;
|
this.shardSize = shardSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the max num docs to be returned from each shard.
|
||||||
|
*/
|
||||||
|
public int shardSize() {
|
||||||
|
return shardSize;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Aggregator createInternal(AggregationContext context, Aggregator parent, boolean collectsFromSingleBucket,
|
public Aggregator createInternal(AggregationContext context, Aggregator parent, boolean collectsFromSingleBucket,
|
||||||
List<PipelineAggregator> pipelineAggregators, Map<String, Object> metaData) throws IOException {
|
List<PipelineAggregator> pipelineAggregators, Map<String, Object> metaData) throws IOException {
|
||||||
return new SamplerAggregator(name, shardSize, factories, context, parent, pipelineAggregators, metaData);
|
return new SamplerAggregator(name, shardSize, factories, context, parent, pipelineAggregators, metaData);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected XContentBuilder internalXContent(XContentBuilder builder, Params params) throws IOException {
|
||||||
|
builder.startObject();
|
||||||
|
builder.field(SHARD_SIZE_FIELD.getPreferredName(), shardSize);
|
||||||
|
builder.endObject();
|
||||||
|
return builder;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected AggregatorFactory doReadFrom(String name, StreamInput in) throws IOException {
|
||||||
|
Factory factory = new Factory(name);
|
||||||
|
factory.shardSize = in.readVInt();
|
||||||
|
return factory;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void doWriteTo(StreamOutput out) throws IOException {
|
||||||
|
out.writeVInt(shardSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected int doHashCode() {
|
||||||
|
return Objects.hash(shardSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected boolean doEquals(Object obj) {
|
||||||
|
Factory other = (Factory) obj;
|
||||||
|
return Objects.equals(shardSize, other.shardSize);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static class DiversifiedFactory extends ValuesSourceAggregatorFactory<ValuesSource> {
|
public static class DiversifiedFactory extends ValuesSourceAggregatorFactory<ValuesSource> {
|
||||||
|
|
||||||
private int shardSize;
|
public static final Type TYPE = new Type("diversified_sampler");
|
||||||
private int maxDocsPerValue;
|
|
||||||
private String executionHint;
|
|
||||||
|
|
||||||
public DiversifiedFactory(String name, int shardSize, String executionHint, ValuesSourceParser.Input vsInput, int maxDocsPerValue) {
|
public static final int MAX_DOCS_PER_VALUE_DEFAULT = 1;
|
||||||
super(name, InternalSampler.TYPE, vsInput);
|
|
||||||
|
private int shardSize = Factory.DEFAULT_SHARD_SAMPLE_SIZE;
|
||||||
|
private int maxDocsPerValue = MAX_DOCS_PER_VALUE_DEFAULT;
|
||||||
|
private String executionHint = null;
|
||||||
|
|
||||||
|
public DiversifiedFactory(String name, ValuesSourceType valueSourceType, ValueType valueType) {
|
||||||
|
super(name, TYPE, valueSourceType, valueType);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the max num docs to be returned from each shard.
|
||||||
|
*/
|
||||||
|
public void shardSize(int shardSize) {
|
||||||
this.shardSize = shardSize;
|
this.shardSize = shardSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the max num docs to be returned from each shard.
|
||||||
|
*/
|
||||||
|
public int shardSize() {
|
||||||
|
return shardSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the max num docs to be returned per value.
|
||||||
|
*/
|
||||||
|
public void maxDocsPerValue(int maxDocsPerValue) {
|
||||||
this.maxDocsPerValue = maxDocsPerValue;
|
this.maxDocsPerValue = maxDocsPerValue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the max num docs to be returned per value.
|
||||||
|
*/
|
||||||
|
public int maxDocsPerValue() {
|
||||||
|
return maxDocsPerValue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the execution hint.
|
||||||
|
*/
|
||||||
|
public void executionHint(String executionHint) {
|
||||||
this.executionHint = executionHint;
|
this.executionHint = executionHint;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the execution hint.
|
||||||
|
*/
|
||||||
|
public String executionHint() {
|
||||||
|
return executionHint;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Aggregator doCreateInternal(ValuesSource valuesSource, AggregationContext context, Aggregator parent,
|
protected Aggregator doCreateInternal(ValuesSource valuesSource, AggregationContext context, Aggregator parent,
|
||||||
boolean collectsFromSingleBucket, List<PipelineAggregator> pipelineAggregators, Map<String, Object> metaData)
|
boolean collectsFromSingleBucket, List<PipelineAggregator> pipelineAggregators, Map<String, Object> metaData)
|
||||||
|
@ -256,6 +355,45 @@ public class SamplerAggregator extends SingleBucketAggregator {
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException {
|
||||||
|
builder.field(SHARD_SIZE_FIELD.getPreferredName(), shardSize);
|
||||||
|
builder.field(MAX_DOCS_PER_VALUE_FIELD.getPreferredName(), maxDocsPerValue);
|
||||||
|
if (executionHint != null) {
|
||||||
|
builder.field(EXECUTION_HINT_FIELD.getPreferredName(), executionHint);
|
||||||
|
}
|
||||||
|
return builder;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected ValuesSourceAggregatorFactory<ValuesSource> innerReadFrom(String name, ValuesSourceType valuesSourceType,
|
||||||
|
ValueType targetValueType, StreamInput in) throws IOException {
|
||||||
|
DiversifiedFactory factory = new DiversifiedFactory(name, valuesSourceType, targetValueType);
|
||||||
|
factory.shardSize = in.readVInt();
|
||||||
|
factory.maxDocsPerValue = in.readVInt();
|
||||||
|
factory.executionHint = in.readOptionalString();
|
||||||
|
return factory;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void innerWriteTo(StreamOutput out) throws IOException {
|
||||||
|
out.writeVInt(shardSize);
|
||||||
|
out.writeVInt(maxDocsPerValue);
|
||||||
|
out.writeOptionalString(executionHint);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected int innerHashCode() {
|
||||||
|
return Objects.hash(shardSize, maxDocsPerValue, executionHint);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected boolean innerEquals(Object obj) {
|
||||||
|
DiversifiedFactory other = (DiversifiedFactory) obj;
|
||||||
|
return Objects.equals(shardSize, other.shardSize)
|
||||||
|
&& Objects.equals(maxDocsPerValue, other.maxDocsPerValue)
|
||||||
|
&& Objects.equals(executionHint, other.executionHint);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -18,12 +18,11 @@
|
||||||
*/
|
*/
|
||||||
package org.elasticsearch.search.aggregations.bucket.sampler;
|
package org.elasticsearch.search.aggregations.bucket.sampler;
|
||||||
|
|
||||||
import org.elasticsearch.common.ParseField;
|
|
||||||
|
import org.elasticsearch.common.ParsingException;
|
||||||
import org.elasticsearch.common.xcontent.XContentParser;
|
import org.elasticsearch.common.xcontent.XContentParser;
|
||||||
import org.elasticsearch.search.SearchParseException;
|
|
||||||
import org.elasticsearch.search.aggregations.Aggregator;
|
import org.elasticsearch.search.aggregations.Aggregator;
|
||||||
import org.elasticsearch.search.aggregations.AggregatorFactory;
|
import org.elasticsearch.search.aggregations.AggregatorFactory;
|
||||||
import org.elasticsearch.search.aggregations.support.ValuesSourceParser;
|
|
||||||
import org.elasticsearch.search.internal.SearchContext;
|
import org.elasticsearch.search.internal.SearchContext;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
@ -33,14 +32,6 @@ import java.io.IOException;
|
||||||
*/
|
*/
|
||||||
public class SamplerParser implements Aggregator.Parser {
|
public class SamplerParser implements Aggregator.Parser {
|
||||||
|
|
||||||
public static final int DEFAULT_SHARD_SAMPLE_SIZE = 100;
|
|
||||||
public static final ParseField SHARD_SIZE_FIELD = new ParseField("shard_size");
|
|
||||||
public static final ParseField MAX_DOCS_PER_VALUE_FIELD = new ParseField("max_docs_per_value");
|
|
||||||
public static final ParseField EXECUTION_HINT_FIELD = new ParseField("execution_hint");
|
|
||||||
public static final boolean DEFAULT_USE_GLOBAL_ORDINALS = false;
|
|
||||||
public static final int MAX_DOCS_PER_VALUE_DEFAULT = 1;
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String type() {
|
public String type() {
|
||||||
return InternalSampler.TYPE.name();
|
return InternalSampler.TYPE.name();
|
||||||
|
@ -51,60 +42,34 @@ public class SamplerParser implements Aggregator.Parser {
|
||||||
|
|
||||||
XContentParser.Token token;
|
XContentParser.Token token;
|
||||||
String currentFieldName = null;
|
String currentFieldName = null;
|
||||||
String executionHint = null;
|
Integer shardSize = null;
|
||||||
int shardSize = DEFAULT_SHARD_SAMPLE_SIZE;
|
|
||||||
int maxDocsPerValue = MAX_DOCS_PER_VALUE_DEFAULT;
|
|
||||||
ValuesSourceParser vsParser = null;
|
|
||||||
boolean diversityChoiceMade = false;
|
|
||||||
|
|
||||||
vsParser = ValuesSourceParser.any(aggregationName, InternalSampler.TYPE, context).scriptable(true).formattable(false).build();
|
|
||||||
|
|
||||||
while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
|
while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
|
||||||
if (token == XContentParser.Token.FIELD_NAME) {
|
if (token == XContentParser.Token.FIELD_NAME) {
|
||||||
currentFieldName = parser.currentName();
|
currentFieldName = parser.currentName();
|
||||||
} else if (vsParser.token(currentFieldName, token, parser)) {
|
|
||||||
continue;
|
|
||||||
} else if (token == XContentParser.Token.VALUE_NUMBER) {
|
} else if (token == XContentParser.Token.VALUE_NUMBER) {
|
||||||
if (context.parseFieldMatcher().match(currentFieldName, SHARD_SIZE_FIELD)) {
|
if (context.parseFieldMatcher().match(currentFieldName, SamplerAggregator.SHARD_SIZE_FIELD)) {
|
||||||
shardSize = parser.intValue();
|
shardSize = parser.intValue();
|
||||||
} else if (context.parseFieldMatcher().match(currentFieldName, MAX_DOCS_PER_VALUE_FIELD)) {
|
|
||||||
diversityChoiceMade = true;
|
|
||||||
maxDocsPerValue = parser.intValue();
|
|
||||||
} else {
|
} else {
|
||||||
throw new SearchParseException(context, "Unsupported property \"" + currentFieldName + "\" for aggregation \""
|
throw new ParsingException(parser.getTokenLocation(),
|
||||||
+ aggregationName, parser.getTokenLocation());
|
"Unsupported property \"" + currentFieldName + "\" for aggregation \"" + aggregationName);
|
||||||
}
|
|
||||||
} else if (!vsParser.token(currentFieldName, token, parser)) {
|
|
||||||
if (context.parseFieldMatcher().match(currentFieldName, EXECUTION_HINT_FIELD)) {
|
|
||||||
executionHint = parser.text();
|
|
||||||
} else {
|
|
||||||
throw new SearchParseException(context, "Unexpected token " + token + " in [" + aggregationName + "].",
|
|
||||||
parser.getTokenLocation());
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
throw new SearchParseException(context, "Unsupported property \"" + currentFieldName + "\" for aggregation \""
|
throw new ParsingException(parser.getTokenLocation(),
|
||||||
+ aggregationName, parser.getTokenLocation());
|
"Unsupported property \"" + currentFieldName + "\" for aggregation \"" + aggregationName);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ValuesSourceParser.Input vsInput = vsParser.input();
|
SamplerAggregator.Factory factory = new SamplerAggregator.Factory(aggregationName);
|
||||||
if (vsInput.valid()) {
|
if (shardSize != null) {
|
||||||
return new SamplerAggregator.DiversifiedFactory(aggregationName, shardSize, executionHint, vsInput, maxDocsPerValue);
|
factory.shardSize(shardSize);
|
||||||
} else {
|
|
||||||
if (diversityChoiceMade) {
|
|
||||||
throw new SearchParseException(context, "Sampler aggregation has " + MAX_DOCS_PER_VALUE_FIELD.getPreferredName()
|
|
||||||
+ " setting but no \"field\" or \"script\" setting to provide values for aggregation \"" + aggregationName + "\"",
|
|
||||||
parser.getTokenLocation());
|
|
||||||
|
|
||||||
}
|
|
||||||
return new SamplerAggregator.Factory(aggregationName, shardSize);
|
|
||||||
}
|
}
|
||||||
|
return factory;
|
||||||
}
|
}
|
||||||
|
|
||||||
// NORELEASE implement this method when refactoring this aggregation
|
|
||||||
@Override
|
@Override
|
||||||
public AggregatorFactory[] getFactoryPrototypes() {
|
public AggregatorFactory[] getFactoryPrototypes() {
|
||||||
return null;
|
return new AggregatorFactory[] { new SamplerAggregator.Factory(null) };
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -433,7 +433,7 @@ public abstract class ValuesSourceAggregatorFactory<VS extends ValuesSource> ext
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public final int doHashCode() {
|
protected final int doHashCode() {
|
||||||
return Objects.hash(field, format, missing, script, targetValueType, timeZone, valueType, valuesSourceType,
|
return Objects.hash(field, format, missing, script, targetValueType, timeZone, valueType, valuesSourceType,
|
||||||
innerHashCode());
|
innerHashCode());
|
||||||
}
|
}
|
||||||
|
@ -446,7 +446,7 @@ public abstract class ValuesSourceAggregatorFactory<VS extends ValuesSource> ext
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public final boolean doEquals(Object obj) {
|
protected final boolean doEquals(Object obj) {
|
||||||
ValuesSourceAggregatorFactory<?> other = (ValuesSourceAggregatorFactory<?>) obj;
|
ValuesSourceAggregatorFactory<?> other = (ValuesSourceAggregatorFactory<?>) obj;
|
||||||
if (!Objects.equals(field, other.field))
|
if (!Objects.equals(field, other.field))
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -0,0 +1,238 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
package org.elasticsearch.search.aggregations.bucket;
|
||||||
|
|
||||||
|
import org.elasticsearch.action.admin.indices.refresh.RefreshRequest;
|
||||||
|
import org.elasticsearch.action.search.SearchResponse;
|
||||||
|
import org.elasticsearch.action.search.SearchType;
|
||||||
|
import org.elasticsearch.index.query.TermQueryBuilder;
|
||||||
|
import org.elasticsearch.search.aggregations.bucket.sampler.DiversifiedSamplerAggregationBuilder;
|
||||||
|
import org.elasticsearch.search.aggregations.bucket.sampler.Sampler;
|
||||||
|
import org.elasticsearch.search.aggregations.bucket.sampler.SamplerAggregator;
|
||||||
|
import org.elasticsearch.search.aggregations.bucket.terms.Terms;
|
||||||
|
import org.elasticsearch.search.aggregations.bucket.terms.Terms.Bucket;
|
||||||
|
import org.elasticsearch.search.aggregations.bucket.terms.TermsBuilder;
|
||||||
|
import org.elasticsearch.search.aggregations.metrics.max.Max;
|
||||||
|
import org.elasticsearch.test.ESIntegTestCase;
|
||||||
|
|
||||||
|
import java.util.Collection;
|
||||||
|
|
||||||
|
import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_REPLICAS;
|
||||||
|
import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_SHARDS;
|
||||||
|
import static org.elasticsearch.search.aggregations.AggregationBuilders.max;
|
||||||
|
import static org.elasticsearch.search.aggregations.AggregationBuilders.sampler;
|
||||||
|
import static org.elasticsearch.search.aggregations.AggregationBuilders.terms;
|
||||||
|
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
|
||||||
|
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchResponse;
|
||||||
|
import static org.hamcrest.Matchers.equalTo;
|
||||||
|
import static org.hamcrest.Matchers.greaterThan;
|
||||||
|
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
|
||||||
|
import static org.hamcrest.Matchers.lessThanOrEqualTo;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests the Sampler aggregation
|
||||||
|
*/
|
||||||
|
@ESIntegTestCase.SuiteScopeTestCase
|
||||||
|
public class DiversifiedSamplerIT extends ESIntegTestCase {
|
||||||
|
|
||||||
|
public static final int NUM_SHARDS = 2;
|
||||||
|
|
||||||
|
public String randomExecutionHint() {
|
||||||
|
return randomBoolean() ? null : randomFrom(SamplerAggregator.ExecutionMode.values()).toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setupSuiteScopeCluster() throws Exception {
|
||||||
|
assertAcked(prepareCreate("test").setSettings(SETTING_NUMBER_OF_SHARDS, NUM_SHARDS, SETTING_NUMBER_OF_REPLICAS, 0).addMapping(
|
||||||
|
"book", "author", "type=string,index=not_analyzed", "name", "type=string,index=analyzed", "genre",
|
||||||
|
"type=string,index=not_analyzed", "price", "type=float"));
|
||||||
|
createIndex("idx_unmapped");
|
||||||
|
// idx_unmapped_author is same as main index but missing author field
|
||||||
|
assertAcked(prepareCreate("idx_unmapped_author").setSettings(SETTING_NUMBER_OF_SHARDS, NUM_SHARDS, SETTING_NUMBER_OF_REPLICAS, 0)
|
||||||
|
.addMapping("book", "name", "type=string,index=analyzed", "genre", "type=string,index=not_analyzed", "price", "type=float"));
|
||||||
|
|
||||||
|
ensureGreen();
|
||||||
|
String data[] = {
|
||||||
|
// "id,cat,name,price,inStock,author_t,series_t,sequence_i,genre_s",
|
||||||
|
"0553573403,book,A Game of Thrones,7.99,true,George R.R. Martin,A Song of Ice and Fire,1,fantasy",
|
||||||
|
"0553579908,book,A Clash of Kings,7.99,true,George R.R. Martin,A Song of Ice and Fire,2,fantasy",
|
||||||
|
"055357342X,book,A Storm of Swords,7.99,true,George R.R. Martin,A Song of Ice and Fire,3,fantasy",
|
||||||
|
"0553293354,book,Foundation,17.99,true,Isaac Asimov,Foundation Novels,1,scifi",
|
||||||
|
"0812521390,book,The Black Company,6.99,false,Glen Cook,The Chronicles of The Black Company,1,fantasy",
|
||||||
|
"0812550706,book,Ender's Game,6.99,true,Orson Scott Card,Ender,1,scifi",
|
||||||
|
"0441385532,book,Jhereg,7.95,false,Steven Brust,Vlad Taltos,1,fantasy",
|
||||||
|
"0380014300,book,Nine Princes In Amber,6.99,true,Roger Zelazny,the Chronicles of Amber,1,fantasy",
|
||||||
|
"0805080481,book,The Book of Three,5.99,true,Lloyd Alexander,The Chronicles of Prydain,1,fantasy",
|
||||||
|
"080508049X,book,The Black Cauldron,5.99,true,Lloyd Alexander,The Chronicles of Prydain,2,fantasy"
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
for (int i = 0; i < data.length; i++) {
|
||||||
|
String[] parts = data[i].split(",");
|
||||||
|
client().prepareIndex("test", "book", "" + i).setSource("author", parts[5], "name", parts[2], "genre", parts[8], "price",Float.parseFloat(parts[3])).get();
|
||||||
|
client().prepareIndex("idx_unmapped_author", "book", "" + i).setSource("name", parts[2], "genre", parts[8],"price",Float.parseFloat(parts[3])).get();
|
||||||
|
}
|
||||||
|
client().admin().indices().refresh(new RefreshRequest("test")).get();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testIssue10719() throws Exception {
|
||||||
|
// Tests that we can refer to nested elements under a sample in a path
|
||||||
|
// statement
|
||||||
|
boolean asc = randomBoolean();
|
||||||
|
SearchResponse response = client().prepareSearch("test").setTypes("book").setSearchType(SearchType.QUERY_AND_FETCH)
|
||||||
|
.addAggregation(terms("genres")
|
||||||
|
.field("genre")
|
||||||
|
.order(Terms.Order.aggregation("sample>max_price.value", asc))
|
||||||
|
.subAggregation(sampler("sample").shardSize(100)
|
||||||
|
.subAggregation(max("max_price").field("price")))
|
||||||
|
).execute().actionGet();
|
||||||
|
assertSearchResponse(response);
|
||||||
|
Terms genres = response.getAggregations().get("genres");
|
||||||
|
Collection<Bucket> genreBuckets = genres.getBuckets();
|
||||||
|
// For this test to be useful we need >1 genre bucket to compare
|
||||||
|
assertThat(genreBuckets.size(), greaterThan(1));
|
||||||
|
double lastMaxPrice = asc ? Double.MIN_VALUE : Double.MAX_VALUE;
|
||||||
|
for (Terms.Bucket genreBucket : genres.getBuckets()) {
|
||||||
|
Sampler sample = genreBucket.getAggregations().get("sample");
|
||||||
|
Max maxPriceInGenre = sample.getAggregations().get("max_price");
|
||||||
|
double price = maxPriceInGenre.getValue();
|
||||||
|
if (asc) {
|
||||||
|
assertThat(price, greaterThanOrEqualTo(lastMaxPrice));
|
||||||
|
} else {
|
||||||
|
assertThat(price, lessThanOrEqualTo(lastMaxPrice));
|
||||||
|
}
|
||||||
|
lastMaxPrice = price;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testSimpleDiversity() throws Exception {
|
||||||
|
int MAX_DOCS_PER_AUTHOR = 1;
|
||||||
|
DiversifiedSamplerAggregationBuilder sampleAgg = new DiversifiedSamplerAggregationBuilder("sample").shardSize(100);
|
||||||
|
sampleAgg.field("author").maxDocsPerValue(MAX_DOCS_PER_AUTHOR).executionHint(randomExecutionHint());
|
||||||
|
sampleAgg.subAggregation(new TermsBuilder("authors").field("author"));
|
||||||
|
SearchResponse response = client().prepareSearch("test")
|
||||||
|
.setSearchType(SearchType.QUERY_AND_FETCH)
|
||||||
|
.setQuery(new TermQueryBuilder("genre", "fantasy"))
|
||||||
|
.setFrom(0).setSize(60)
|
||||||
|
.addAggregation(sampleAgg)
|
||||||
|
.execute()
|
||||||
|
.actionGet();
|
||||||
|
assertSearchResponse(response);
|
||||||
|
Sampler sample = response.getAggregations().get("sample");
|
||||||
|
Terms authors = sample.getAggregations().get("authors");
|
||||||
|
Collection<Bucket> testBuckets = authors.getBuckets();
|
||||||
|
|
||||||
|
for (Terms.Bucket testBucket : testBuckets) {
|
||||||
|
assertThat(testBucket.getDocCount(), lessThanOrEqualTo((long) NUM_SHARDS * MAX_DOCS_PER_AUTHOR));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testNestedDiversity() throws Exception {
|
||||||
|
// Test multiple samples gathered under buckets made by a parent agg
|
||||||
|
int MAX_DOCS_PER_AUTHOR = 1;
|
||||||
|
TermsBuilder rootTerms = new TermsBuilder("genres").field("genre");
|
||||||
|
|
||||||
|
DiversifiedSamplerAggregationBuilder sampleAgg = new DiversifiedSamplerAggregationBuilder("sample").shardSize(100);
|
||||||
|
sampleAgg.field("author").maxDocsPerValue(MAX_DOCS_PER_AUTHOR).executionHint(randomExecutionHint());
|
||||||
|
sampleAgg.subAggregation(new TermsBuilder("authors").field("author"));
|
||||||
|
|
||||||
|
rootTerms.subAggregation(sampleAgg);
|
||||||
|
SearchResponse response = client().prepareSearch("test").setSearchType(SearchType.QUERY_AND_FETCH)
|
||||||
|
.addAggregation(rootTerms).execute().actionGet();
|
||||||
|
assertSearchResponse(response);
|
||||||
|
Terms genres = response.getAggregations().get("genres");
|
||||||
|
Collection<Bucket> genreBuckets = genres.getBuckets();
|
||||||
|
for (Terms.Bucket genreBucket : genreBuckets) {
|
||||||
|
Sampler sample = genreBucket.getAggregations().get("sample");
|
||||||
|
Terms authors = sample.getAggregations().get("authors");
|
||||||
|
Collection<Bucket> testBuckets = authors.getBuckets();
|
||||||
|
|
||||||
|
for (Terms.Bucket testBucket : testBuckets) {
|
||||||
|
assertThat(testBucket.getDocCount(), lessThanOrEqualTo((long) NUM_SHARDS * MAX_DOCS_PER_AUTHOR));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testNestedSamples() throws Exception {
|
||||||
|
// Test samples nested under samples
|
||||||
|
int MAX_DOCS_PER_AUTHOR = 1;
|
||||||
|
int MAX_DOCS_PER_GENRE = 2;
|
||||||
|
DiversifiedSamplerAggregationBuilder rootSample = new DiversifiedSamplerAggregationBuilder("genreSample").shardSize(100)
|
||||||
|
.field("genre")
|
||||||
|
.maxDocsPerValue(MAX_DOCS_PER_GENRE);
|
||||||
|
|
||||||
|
DiversifiedSamplerAggregationBuilder sampleAgg = new DiversifiedSamplerAggregationBuilder("sample").shardSize(100);
|
||||||
|
sampleAgg.field("author").maxDocsPerValue(MAX_DOCS_PER_AUTHOR).executionHint(randomExecutionHint());
|
||||||
|
sampleAgg.subAggregation(new TermsBuilder("authors").field("author"));
|
||||||
|
sampleAgg.subAggregation(new TermsBuilder("genres").field("genre"));
|
||||||
|
|
||||||
|
rootSample.subAggregation(sampleAgg);
|
||||||
|
SearchResponse response = client().prepareSearch("test").setSearchType(SearchType.QUERY_AND_FETCH).addAggregation(rootSample)
|
||||||
|
.execute().actionGet();
|
||||||
|
assertSearchResponse(response);
|
||||||
|
Sampler genreSample = response.getAggregations().get("genreSample");
|
||||||
|
Sampler sample = genreSample.getAggregations().get("sample");
|
||||||
|
|
||||||
|
Terms genres = sample.getAggregations().get("genres");
|
||||||
|
Collection<Bucket> testBuckets = genres.getBuckets();
|
||||||
|
for (Terms.Bucket testBucket : testBuckets) {
|
||||||
|
assertThat(testBucket.getDocCount(), lessThanOrEqualTo((long) NUM_SHARDS * MAX_DOCS_PER_GENRE));
|
||||||
|
}
|
||||||
|
|
||||||
|
Terms authors = sample.getAggregations().get("authors");
|
||||||
|
testBuckets = authors.getBuckets();
|
||||||
|
for (Terms.Bucket testBucket : testBuckets) {
|
||||||
|
assertThat(testBucket.getDocCount(), lessThanOrEqualTo((long) NUM_SHARDS * MAX_DOCS_PER_AUTHOR));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testPartiallyUnmappedDiversifyField() throws Exception {
|
||||||
|
// One of the indexes is missing the "author" field used for
|
||||||
|
// diversifying results
|
||||||
|
DiversifiedSamplerAggregationBuilder sampleAgg = new DiversifiedSamplerAggregationBuilder("sample").shardSize(100).field("author")
|
||||||
|
.maxDocsPerValue(1);
|
||||||
|
sampleAgg.subAggregation(new TermsBuilder("authors").field("author"));
|
||||||
|
SearchResponse response = client().prepareSearch("idx_unmapped_author", "test").setSearchType(SearchType.QUERY_AND_FETCH)
|
||||||
|
.setQuery(new TermQueryBuilder("genre", "fantasy")).setFrom(0).setSize(60).addAggregation(sampleAgg)
|
||||||
|
.execute().actionGet();
|
||||||
|
assertSearchResponse(response);
|
||||||
|
Sampler sample = response.getAggregations().get("sample");
|
||||||
|
assertThat(sample.getDocCount(), greaterThan(0l));
|
||||||
|
Terms authors = sample.getAggregations().get("authors");
|
||||||
|
assertThat(authors.getBuckets().size(), greaterThan(0));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testWhollyUnmappedDiversifyField() throws Exception {
|
||||||
|
//All of the indices are missing the "author" field used for diversifying results
|
||||||
|
int MAX_DOCS_PER_AUTHOR = 1;
|
||||||
|
DiversifiedSamplerAggregationBuilder sampleAgg = new DiversifiedSamplerAggregationBuilder("sample").shardSize(100);
|
||||||
|
sampleAgg.field("author").maxDocsPerValue(MAX_DOCS_PER_AUTHOR).executionHint(randomExecutionHint());
|
||||||
|
sampleAgg.subAggregation(new TermsBuilder("authors").field("author"));
|
||||||
|
SearchResponse response = client().prepareSearch("idx_unmapped", "idx_unmapped_author").setSearchType(SearchType.QUERY_AND_FETCH)
|
||||||
|
.setQuery(new TermQueryBuilder("genre", "fantasy")).setFrom(0).setSize(60).addAggregation(sampleAgg).execute().actionGet();
|
||||||
|
assertSearchResponse(response);
|
||||||
|
Sampler sample = response.getAggregations().get("sample");
|
||||||
|
assertThat(sample.getDocCount(), equalTo(0l));
|
||||||
|
Terms authors = sample.getAggregations().get("authors");
|
||||||
|
assertNull(authors);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,63 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.search.aggregations.bucket;
|
||||||
|
|
||||||
|
import org.elasticsearch.script.Script;
|
||||||
|
import org.elasticsearch.search.aggregations.BaseAggregationTestCase;
|
||||||
|
import org.elasticsearch.search.aggregations.bucket.sampler.SamplerAggregator;
|
||||||
|
import org.elasticsearch.search.aggregations.bucket.sampler.SamplerAggregator.ExecutionMode;
|
||||||
|
import org.elasticsearch.search.aggregations.support.ValuesSourceType;
|
||||||
|
|
||||||
|
public class DiversifiedSamplerTests extends BaseAggregationTestCase<SamplerAggregator.DiversifiedFactory> {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected final SamplerAggregator.DiversifiedFactory createTestAggregatorFactory() {
|
||||||
|
SamplerAggregator.DiversifiedFactory factory = new SamplerAggregator.DiversifiedFactory("foo", ValuesSourceType.ANY,
|
||||||
|
null);
|
||||||
|
String field = randomNumericField();
|
||||||
|
int randomFieldBranch = randomInt(3);
|
||||||
|
switch (randomFieldBranch) {
|
||||||
|
case 0:
|
||||||
|
factory.field(field);
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
factory.field(field);
|
||||||
|
factory.script(new Script("_value + 1"));
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
factory.script(new Script("doc[" + field + "] + 1"));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (randomBoolean()) {
|
||||||
|
factory.missing("MISSING");
|
||||||
|
}
|
||||||
|
if (randomBoolean()) {
|
||||||
|
factory.maxDocsPerValue(randomIntBetween(1, 1000));
|
||||||
|
}
|
||||||
|
if (randomBoolean()) {
|
||||||
|
factory.shardSize(randomIntBetween(1, 1000));
|
||||||
|
}
|
||||||
|
if (randomBoolean()) {
|
||||||
|
factory.executionHint(randomFrom(ExecutionMode.values()).toString());
|
||||||
|
}
|
||||||
|
return factory;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -123,7 +123,7 @@ public class SamplerIT extends ESIntegTestCase {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testNoDiversity() throws Exception {
|
public void testSimpleSampler() throws Exception {
|
||||||
SamplerAggregationBuilder sampleAgg = new SamplerAggregationBuilder("sample").shardSize(100);
|
SamplerAggregationBuilder sampleAgg = new SamplerAggregationBuilder("sample").shardSize(100);
|
||||||
sampleAgg.subAggregation(new TermsBuilder("authors").field("author"));
|
sampleAgg.subAggregation(new TermsBuilder("authors").field("author"));
|
||||||
SearchResponse response = client().prepareSearch("test").setSearchType(SearchType.QUERY_AND_FETCH)
|
SearchResponse response = client().prepareSearch("test").setSearchType(SearchType.QUERY_AND_FETCH)
|
||||||
|
@ -140,86 +140,6 @@ public class SamplerIT extends ESIntegTestCase {
|
||||||
assertThat(maxBooksPerAuthor, equalTo(3l));
|
assertThat(maxBooksPerAuthor, equalTo(3l));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testSimpleDiversity() throws Exception {
|
|
||||||
int MAX_DOCS_PER_AUTHOR = 1;
|
|
||||||
SamplerAggregationBuilder sampleAgg = new SamplerAggregationBuilder("sample").shardSize(100);
|
|
||||||
sampleAgg.field("author").maxDocsPerValue(MAX_DOCS_PER_AUTHOR).executionHint(randomExecutionHint());
|
|
||||||
sampleAgg.subAggregation(new TermsBuilder("authors").field("author"));
|
|
||||||
SearchResponse response = client().prepareSearch("test")
|
|
||||||
.setSearchType(SearchType.QUERY_AND_FETCH)
|
|
||||||
.setQuery(new TermQueryBuilder("genre", "fantasy"))
|
|
||||||
.setFrom(0).setSize(60)
|
|
||||||
.addAggregation(sampleAgg)
|
|
||||||
.execute()
|
|
||||||
.actionGet();
|
|
||||||
assertSearchResponse(response);
|
|
||||||
Sampler sample = response.getAggregations().get("sample");
|
|
||||||
Terms authors = sample.getAggregations().get("authors");
|
|
||||||
Collection<Bucket> testBuckets = authors.getBuckets();
|
|
||||||
|
|
||||||
for (Terms.Bucket testBucket : testBuckets) {
|
|
||||||
assertThat(testBucket.getDocCount(), lessThanOrEqualTo((long) NUM_SHARDS * MAX_DOCS_PER_AUTHOR));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testNestedDiversity() throws Exception {
|
|
||||||
// Test multiple samples gathered under buckets made by a parent agg
|
|
||||||
int MAX_DOCS_PER_AUTHOR = 1;
|
|
||||||
TermsBuilder rootTerms = new TermsBuilder("genres").field("genre");
|
|
||||||
|
|
||||||
SamplerAggregationBuilder sampleAgg = new SamplerAggregationBuilder("sample").shardSize(100);
|
|
||||||
sampleAgg.field("author").maxDocsPerValue(MAX_DOCS_PER_AUTHOR).executionHint(randomExecutionHint());
|
|
||||||
sampleAgg.subAggregation(new TermsBuilder("authors").field("author"));
|
|
||||||
|
|
||||||
rootTerms.subAggregation(sampleAgg);
|
|
||||||
SearchResponse response = client().prepareSearch("test").setSearchType(SearchType.QUERY_AND_FETCH)
|
|
||||||
.addAggregation(rootTerms).execute().actionGet();
|
|
||||||
assertSearchResponse(response);
|
|
||||||
Terms genres = response.getAggregations().get("genres");
|
|
||||||
Collection<Bucket> genreBuckets = genres.getBuckets();
|
|
||||||
for (Terms.Bucket genreBucket : genreBuckets) {
|
|
||||||
Sampler sample = genreBucket.getAggregations().get("sample");
|
|
||||||
Terms authors = sample.getAggregations().get("authors");
|
|
||||||
Collection<Bucket> testBuckets = authors.getBuckets();
|
|
||||||
|
|
||||||
for (Terms.Bucket testBucket : testBuckets) {
|
|
||||||
assertThat(testBucket.getDocCount(), lessThanOrEqualTo((long) NUM_SHARDS * MAX_DOCS_PER_AUTHOR));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testNestedSamples() throws Exception {
|
|
||||||
// Test samples nested under samples
|
|
||||||
int MAX_DOCS_PER_AUTHOR = 1;
|
|
||||||
int MAX_DOCS_PER_GENRE = 2;
|
|
||||||
SamplerAggregationBuilder rootSample = new SamplerAggregationBuilder("genreSample").shardSize(100).field("genre")
|
|
||||||
.maxDocsPerValue(MAX_DOCS_PER_GENRE);
|
|
||||||
|
|
||||||
SamplerAggregationBuilder sampleAgg = new SamplerAggregationBuilder("sample").shardSize(100);
|
|
||||||
sampleAgg.field("author").maxDocsPerValue(MAX_DOCS_PER_AUTHOR).executionHint(randomExecutionHint());
|
|
||||||
sampleAgg.subAggregation(new TermsBuilder("authors").field("author"));
|
|
||||||
sampleAgg.subAggregation(new TermsBuilder("genres").field("genre"));
|
|
||||||
|
|
||||||
rootSample.subAggregation(sampleAgg);
|
|
||||||
SearchResponse response = client().prepareSearch("test").setSearchType(SearchType.QUERY_AND_FETCH).addAggregation(rootSample)
|
|
||||||
.execute().actionGet();
|
|
||||||
assertSearchResponse(response);
|
|
||||||
Sampler genreSample = response.getAggregations().get("genreSample");
|
|
||||||
Sampler sample = genreSample.getAggregations().get("sample");
|
|
||||||
|
|
||||||
Terms genres = sample.getAggregations().get("genres");
|
|
||||||
Collection<Bucket> testBuckets = genres.getBuckets();
|
|
||||||
for (Terms.Bucket testBucket : testBuckets) {
|
|
||||||
assertThat(testBucket.getDocCount(), lessThanOrEqualTo((long) NUM_SHARDS * MAX_DOCS_PER_GENRE));
|
|
||||||
}
|
|
||||||
|
|
||||||
Terms authors = sample.getAggregations().get("authors");
|
|
||||||
testBuckets = authors.getBuckets();
|
|
||||||
for (Terms.Bucket testBucket : testBuckets) {
|
|
||||||
assertThat(testBucket.getDocCount(), lessThanOrEqualTo((long) NUM_SHARDS * MAX_DOCS_PER_AUTHOR));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testUnmappedChildAggNoDiversity() throws Exception {
|
public void testUnmappedChildAggNoDiversity() throws Exception {
|
||||||
SamplerAggregationBuilder sampleAgg = new SamplerAggregationBuilder("sample").shardSize(100);
|
SamplerAggregationBuilder sampleAgg = new SamplerAggregationBuilder("sample").shardSize(100);
|
||||||
sampleAgg.subAggregation(new TermsBuilder("authors").field("author"));
|
sampleAgg.subAggregation(new TermsBuilder("authors").field("author"));
|
||||||
|
@ -254,34 +174,4 @@ public class SamplerIT extends ESIntegTestCase {
|
||||||
assertThat(authors.getBuckets().size(), greaterThan(0));
|
assertThat(authors.getBuckets().size(), greaterThan(0));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testPartiallyUnmappedDiversifyField() throws Exception {
|
|
||||||
// One of the indexes is missing the "author" field used for
|
|
||||||
// diversifying results
|
|
||||||
SamplerAggregationBuilder sampleAgg = new SamplerAggregationBuilder("sample").shardSize(100).field("author").maxDocsPerValue(1);
|
|
||||||
sampleAgg.subAggregation(new TermsBuilder("authors").field("author"));
|
|
||||||
SearchResponse response = client().prepareSearch("idx_unmapped_author", "test").setSearchType(SearchType.QUERY_AND_FETCH)
|
|
||||||
.setQuery(new TermQueryBuilder("genre", "fantasy")).setFrom(0).setSize(60).addAggregation(sampleAgg)
|
|
||||||
.execute().actionGet();
|
|
||||||
assertSearchResponse(response);
|
|
||||||
Sampler sample = response.getAggregations().get("sample");
|
|
||||||
assertThat(sample.getDocCount(), greaterThan(0l));
|
|
||||||
Terms authors = sample.getAggregations().get("authors");
|
|
||||||
assertThat(authors.getBuckets().size(), greaterThan(0));
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testWhollyUnmappedDiversifyField() throws Exception {
|
|
||||||
//All of the indices are missing the "author" field used for diversifying results
|
|
||||||
int MAX_DOCS_PER_AUTHOR = 1;
|
|
||||||
SamplerAggregationBuilder sampleAgg = new SamplerAggregationBuilder("sample").shardSize(100);
|
|
||||||
sampleAgg.field("author").maxDocsPerValue(MAX_DOCS_PER_AUTHOR).executionHint(randomExecutionHint());
|
|
||||||
sampleAgg.subAggregation(new TermsBuilder("authors").field("author"));
|
|
||||||
SearchResponse response = client().prepareSearch("idx_unmapped", "idx_unmapped_author").setSearchType(SearchType.QUERY_AND_FETCH)
|
|
||||||
.setQuery(new TermQueryBuilder("genre", "fantasy")).setFrom(0).setSize(60).addAggregation(sampleAgg).execute().actionGet();
|
|
||||||
assertSearchResponse(response);
|
|
||||||
Sampler sample = response.getAggregations().get("sample");
|
|
||||||
assertThat(sample.getDocCount(), equalTo(0l));
|
|
||||||
Terms authors = sample.getAggregations().get("authors");
|
|
||||||
assertNull(authors);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,36 @@
|
||||||
|
/*
|
||||||
|
* Licensed to Elasticsearch under one or more contributor
|
||||||
|
* license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright
|
||||||
|
* ownership. Elasticsearch licenses this file to you under
|
||||||
|
* the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
* not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.search.aggregations.bucket;
|
||||||
|
|
||||||
|
import org.elasticsearch.search.aggregations.BaseAggregationTestCase;
|
||||||
|
import org.elasticsearch.search.aggregations.bucket.sampler.SamplerAggregator;
|
||||||
|
|
||||||
|
public class SamplerTests extends BaseAggregationTestCase<SamplerAggregator.Factory> {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected final SamplerAggregator.Factory createTestAggregatorFactory() {
|
||||||
|
SamplerAggregator.Factory factory = new SamplerAggregator.Factory("foo");
|
||||||
|
if (randomBoolean()) {
|
||||||
|
factory.shardSize(randomIntBetween(1, 1000));
|
||||||
|
}
|
||||||
|
return factory;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -19,6 +19,8 @@ include::bucket/datehistogram-aggregation.asciidoc[]
|
||||||
|
|
||||||
include::bucket/daterange-aggregation.asciidoc[]
|
include::bucket/daterange-aggregation.asciidoc[]
|
||||||
|
|
||||||
|
include::bucket/diversified-sampler-aggregation.asciidoc[]
|
||||||
|
|
||||||
include::bucket/filter-aggregation.asciidoc[]
|
include::bucket/filter-aggregation.asciidoc[]
|
||||||
|
|
||||||
include::bucket/filters-aggregation.asciidoc[]
|
include::bucket/filters-aggregation.asciidoc[]
|
||||||
|
|
|
@ -0,0 +1,154 @@
|
||||||
|
[[search-aggregations-bucket-sampler-aggregation]]
|
||||||
|
=== Sampler Aggregation
|
||||||
|
|
||||||
|
experimental[]
|
||||||
|
|
||||||
|
A filtering aggregation used to limit any sub aggregations' processing to a sample of the top-scoring documents. Diversity settings are
|
||||||
|
used to limit the number of matches that share a common value such as an "author".
|
||||||
|
|
||||||
|
.Example use cases:
|
||||||
|
* Tightening the focus of analytics to high-relevance matches rather than the potentially very long tail of low-quality matches
|
||||||
|
* Removing bias from analytics by ensuring fair representation of content from different sources
|
||||||
|
* Reducing the running cost of aggregations that can produce useful results using only samples e.g. `significant_terms`
|
||||||
|
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
[source,js]
|
||||||
|
--------------------------------------------------
|
||||||
|
{
|
||||||
|
"query": {
|
||||||
|
"match": {
|
||||||
|
"text": "iphone"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"aggs": {
|
||||||
|
"sample": {
|
||||||
|
"sampler": {
|
||||||
|
"shard_size": 200,
|
||||||
|
"field" : "user.id"
|
||||||
|
},
|
||||||
|
"aggs": {
|
||||||
|
"keywords": {
|
||||||
|
"significant_terms": {
|
||||||
|
"field": "text"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
--------------------------------------------------
|
||||||
|
|
||||||
|
Response:
|
||||||
|
|
||||||
|
[source,js]
|
||||||
|
--------------------------------------------------
|
||||||
|
{
|
||||||
|
...
|
||||||
|
"aggregations": {
|
||||||
|
"sample": {
|
||||||
|
"doc_count": 1000,<1>
|
||||||
|
"keywords": {<2>
|
||||||
|
"doc_count": 1000,
|
||||||
|
"buckets": [
|
||||||
|
...
|
||||||
|
{
|
||||||
|
"key": "bend",
|
||||||
|
"doc_count": 58,
|
||||||
|
"score": 37.982536582524276,
|
||||||
|
"bg_count": 103
|
||||||
|
},
|
||||||
|
....
|
||||||
|
}
|
||||||
|
--------------------------------------------------
|
||||||
|
|
||||||
|
<1> 1000 documents were sampled in total becase we asked for a maximum of 200 from an index with 5 shards. The cost of performing the nested significant_terms aggregation was therefore limited rather than unbounded.
|
||||||
|
<2> The results of the significant_terms aggregation are not skewed by any single over-active Twitter user because we asked for a maximum of one tweet from any one user in our sample.
|
||||||
|
|
||||||
|
|
||||||
|
==== shard_size
|
||||||
|
|
||||||
|
The `shard_size` parameter limits how many top-scoring documents are collected in the sample processed on each shard.
|
||||||
|
The default value is 100.
|
||||||
|
|
||||||
|
==== Controlling diversity
|
||||||
|
=`field` or `script` and `max_docs_per_value` settings are used to control the maximum number of documents collected on any one shard which share a common value.
|
||||||
|
The choice of value (e.g. `author`) is loaded from a regular `field` or derived dynamically by a `script`.
|
||||||
|
|
||||||
|
The aggregation will throw an error if the choice of field or script produces multiple values for a document.
|
||||||
|
It is currently not possible to offer this form of de-duplication using many values, primarily due to concerns over efficiency.
|
||||||
|
|
||||||
|
NOTE: Any good market researcher will tell you that when working with samples of data it is important
|
||||||
|
that the sample represents a healthy variety of opinions rather than being skewed by any single voice.
|
||||||
|
The same is true with aggregations and sampling with these diversify settings can offer a way to remove the bias in your content (an over-populated geography, a large spike in a timeline or an over-active forum spammer).
|
||||||
|
|
||||||
|
==== Field
|
||||||
|
|
||||||
|
Controlling diversity using a field:
|
||||||
|
|
||||||
|
[source,js]
|
||||||
|
--------------------------------------------------
|
||||||
|
{
|
||||||
|
"aggs" : {
|
||||||
|
"sample" : {
|
||||||
|
"diverisfied_sampler" : {
|
||||||
|
"field" : "author",
|
||||||
|
"max_docs_per_value" : 3
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
--------------------------------------------------
|
||||||
|
|
||||||
|
Note that the `max_docs_per_value` setting applies on a per-shard basis only for the purposes of shard-local sampling.
|
||||||
|
It is not intended as a way of providing a global de-duplication feature on search results.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
==== Script
|
||||||
|
|
||||||
|
Controlling diversity using a script:
|
||||||
|
|
||||||
|
[source,js]
|
||||||
|
--------------------------------------------------
|
||||||
|
{
|
||||||
|
"aggs" : {
|
||||||
|
"sample" : {
|
||||||
|
"diverisfied_sampler" : {
|
||||||
|
"script" : "doc['author'].value + '/' + doc['genre'].value"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
--------------------------------------------------
|
||||||
|
Note in the above example we chose to use the default `max_docs_per_value` setting of 1 and combine author and genre fields to ensure
|
||||||
|
each shard sample has, at most, one match for an author/genre pair.
|
||||||
|
|
||||||
|
|
||||||
|
==== execution_hint
|
||||||
|
|
||||||
|
When using the settings to control diversity, the optional `execution_hint` setting can influence the management of the values used for de-duplication.
|
||||||
|
Each option will hold up to `shard_size` values in memory while performing de-duplication but the type of value held can be controlled as follows:
|
||||||
|
|
||||||
|
- hold field values directly (`map`)
|
||||||
|
- hold ordinals of the field as determined by the Lucene index (`global_ordinals`)
|
||||||
|
- hold hashes of the field values - with potential for hash collisions (`bytes_hash`)
|
||||||
|
|
||||||
|
The default setting is to use `global_ordinals` if this information is available from the Lucene index and reverting to `map` if not.
|
||||||
|
The `bytes_hash` setting may prove faster in some cases but introduces the possibility of false positives in de-duplication logic due to the possibility of hash collisions.
|
||||||
|
Please note that Elasticsearch will ignore the choice of execution hint if it is not applicable and that there is no backward compatibility guarantee on these hints.
|
||||||
|
|
||||||
|
==== Limitations
|
||||||
|
|
||||||
|
===== Cannot be nested under `breadth_first` aggregations
|
||||||
|
Being a quality-based filter the sampler aggregation needs access to the relevance score produced for each document.
|
||||||
|
It therefore cannot be nested under a `terms` aggregation which has the `collect_mode` switched from the default `depth_first` mode to `breadth_first` as this discards scores.
|
||||||
|
In this situation an error will be thrown.
|
||||||
|
|
||||||
|
===== Limited de-dup logic.
|
||||||
|
The de-duplication logic in the diversify settings applies only at a shard level so will not apply across shards.
|
||||||
|
|
||||||
|
===== No specialized syntax for geo/date fields
|
||||||
|
Currently the syntax for defining the diversifying values is defined by a choice of `field` or `script` - there is no added syntactical sugar for expressing geo or date units such as "1w" (1 week).
|
||||||
|
This support may be added in a later release and users will currently have to create these sorts of values using a script.
|
|
@ -4,11 +4,9 @@
|
||||||
experimental[]
|
experimental[]
|
||||||
|
|
||||||
A filtering aggregation used to limit any sub aggregations' processing to a sample of the top-scoring documents.
|
A filtering aggregation used to limit any sub aggregations' processing to a sample of the top-scoring documents.
|
||||||
Optionally, diversity settings can be used to limit the number of matches that share a common value such as an "author".
|
|
||||||
|
|
||||||
.Example use cases:
|
.Example use cases:
|
||||||
* Tightening the focus of analytics to high-relevance matches rather than the potentially very long tail of low-quality matches
|
* Tightening the focus of analytics to high-relevance matches rather than the potentially very long tail of low-quality matches
|
||||||
* Removing bias from analytics by ensuring fair representation of content from different sources
|
|
||||||
* Reducing the running cost of aggregations that can produce useful results using only samples e.g. `significant_terms`
|
* Reducing the running cost of aggregations that can produce useful results using only samples e.g. `significant_terms`
|
||||||
|
|
||||||
|
|
||||||
|
@ -25,8 +23,7 @@ Example:
|
||||||
"aggs": {
|
"aggs": {
|
||||||
"sample": {
|
"sample": {
|
||||||
"sampler": {
|
"sampler": {
|
||||||
"shard_size": 200,
|
"shard_size": 200
|
||||||
"field" : "user.id"
|
|
||||||
},
|
},
|
||||||
"aggs": {
|
"aggs": {
|
||||||
"keywords": {
|
"keywords": {
|
||||||
|
@ -63,8 +60,7 @@ Response:
|
||||||
}
|
}
|
||||||
--------------------------------------------------
|
--------------------------------------------------
|
||||||
|
|
||||||
<1> 1000 documents were sampled in total becase we asked for a maximum of 200 from an index with 5 shards. The cost of performing the nested significant_terms aggregation was therefore limited rather than unbounded.
|
<1> 1000 documents were sampled in total because we asked for a maximum of 200 from an index with 5 shards. The cost of performing the nested significant_terms aggregation was therefore limited rather than unbounded.
|
||||||
<2> The results of the significant_terms aggregation are not skewed by any single over-active Twitter user because we asked for a maximum of one tweet from any one user in our sample.
|
|
||||||
|
|
||||||
|
|
||||||
==== shard_size
|
==== shard_size
|
||||||
|
@ -72,83 +68,9 @@ Response:
|
||||||
The `shard_size` parameter limits how many top-scoring documents are collected in the sample processed on each shard.
|
The `shard_size` parameter limits how many top-scoring documents are collected in the sample processed on each shard.
|
||||||
The default value is 100.
|
The default value is 100.
|
||||||
|
|
||||||
==== Controlling diversity
|
|
||||||
Optionally, you can use the `field` or `script` and `max_docs_per_value` settings to control the maximum number of documents collected on any one shard which share a common value.
|
|
||||||
The choice of value (e.g. `author`) is loaded from a regular `field` or derived dynamically by a `script`.
|
|
||||||
|
|
||||||
The aggregation will throw an error if the choice of field or script produces multiple values for a document.
|
|
||||||
It is currently not possible to offer this form of de-duplication using many values, primarily due to concerns over efficiency.
|
|
||||||
|
|
||||||
NOTE: Any good market researcher will tell you that when working with samples of data it is important
|
|
||||||
that the sample represents a healthy variety of opinions rather than being skewed by any single voice.
|
|
||||||
The same is true with aggregations and sampling with these diversify settings can offer a way to remove the bias in your content (an over-populated geography, a large spike in a timeline or an over-active forum spammer).
|
|
||||||
|
|
||||||
==== Field
|
|
||||||
|
|
||||||
Controlling diversity using a field:
|
|
||||||
|
|
||||||
[source,js]
|
|
||||||
--------------------------------------------------
|
|
||||||
{
|
|
||||||
"aggs" : {
|
|
||||||
"sample" : {
|
|
||||||
"sampler" : {
|
|
||||||
"field" : "author",
|
|
||||||
"max_docs_per_value" : 3
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
--------------------------------------------------
|
|
||||||
|
|
||||||
Note that the `max_docs_per_value` setting applies on a per-shard basis only for the purposes of shard-local sampling.
|
|
||||||
It is not intended as a way of providing a global de-duplication feature on search results.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
==== Script
|
|
||||||
|
|
||||||
Controlling diversity using a script:
|
|
||||||
|
|
||||||
[source,js]
|
|
||||||
--------------------------------------------------
|
|
||||||
{
|
|
||||||
"aggs" : {
|
|
||||||
"sample" : {
|
|
||||||
"sampler" : {
|
|
||||||
"script" : "doc['author'].value + '/' + doc['genre'].value"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
--------------------------------------------------
|
|
||||||
Note in the above example we chose to use the default `max_docs_per_value` setting of 1 and combine author and genre fields to ensure
|
|
||||||
each shard sample has, at most, one match for an author/genre pair.
|
|
||||||
|
|
||||||
|
|
||||||
==== execution_hint
|
|
||||||
|
|
||||||
When using the settings to control diversity, the optional `execution_hint` setting can influence the management of the values used for de-duplication.
|
|
||||||
Each option will hold up to `shard_size` values in memory while performing de-duplication but the type of value held can be controlled as follows:
|
|
||||||
|
|
||||||
- hold field values directly (`map`)
|
|
||||||
- hold ordinals of the field as determined by the Lucene index (`global_ordinals`)
|
|
||||||
- hold hashes of the field values - with potential for hash collisions (`bytes_hash`)
|
|
||||||
|
|
||||||
The default setting is to use `global_ordinals` if this information is available from the Lucene index and reverting to `map` if not.
|
|
||||||
The `bytes_hash` setting may prove faster in some cases but introduces the possibility of false positives in de-duplication logic due to the possibility of hash collisions.
|
|
||||||
Please note that Elasticsearch will ignore the choice of execution hint if it is not applicable and that there is no backward compatibility guarantee on these hints.
|
|
||||||
|
|
||||||
==== Limitations
|
==== Limitations
|
||||||
|
|
||||||
===== Cannot be nested under `breadth_first` aggregations
|
===== Cannot be nested under `breadth_first` aggregations
|
||||||
Being a quality-based filter the sampler aggregation needs access to the relevance score produced for each document.
|
Being a quality-based filter the sampler aggregation needs access to the relevance score produced for each document.
|
||||||
It therefore cannot be nested under a `terms` aggregation which has the `collect_mode` switched from the default `depth_first` mode to `breadth_first` as this discards scores.
|
It therefore cannot be nested under a `terms` aggregation which has the `collect_mode` switched from the default `depth_first` mode to `breadth_first` as this discards scores.
|
||||||
In this situation an error will be thrown.
|
In this situation an error will be thrown.
|
||||||
|
|
||||||
===== Limited de-dup logic.
|
|
||||||
The de-duplication logic in the diversify settings applies only at a shard level so will not apply across shards.
|
|
||||||
|
|
||||||
===== No specialized syntax for geo/date fields
|
|
||||||
Currently the syntax for defining the diversifying values is defined by a choice of `field` or `script` - there is no added syntactical sugar for expressing geo or date units such as "1w" (1 week).
|
|
||||||
This support may be added in a later release and users will currently have to create these sorts of values using a script.
|
|
Loading…
Reference in New Issue