Remove size 0 options in aggregations

This removes the ability to set `size: 0` in the `terms`, `significant_terms` and `geohash_grid` aggregations for the reasons described in https://github.com/elastic/elasticsearch/issues/18838

Closes #18838
Colin Goodheart-Smithe 2016-06-14 11:37:10 +01:00
parent f8738c853b
commit cfd3356ee3
17 changed files with 122 additions and 242 deletions
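In practice the rejection now happens up front when the request is built or validated, rather than silently expanding to `Integer.MAX_VALUE` buckets. A hedged sketch of the new behaviour, mirroring the test changes below (the `grid` name is illustrative):

[source,java]
--------------------------------------------------
import org.elasticsearch.search.aggregations.AggregationBuilders;

public class SizeZeroExample {
    public static void main(String[] args) {
        try {
            // Throws immediately in the builder's setter after this commit.
            AggregationBuilders.geohashGrid("grid").field("location").size(0);
        } catch (IllegalArgumentException e) {
            // "[size] must be greater than 0. Found [0] in [grid]"
            System.out.println(e.getMessage());
        }
    }
}
--------------------------------------------------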

View File

@ -21,6 +21,7 @@ package org.elasticsearch.search.aggregations.bucket.geogrid;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedNumericDocValues;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.geo.GeoHashUtils;
import org.elasticsearch.common.geo.GeoPoint;
@ -84,9 +85,9 @@ public class GeoGridAggregationBuilder extends ValuesSourceAggregationBuilder<Va
}
public GeoGridAggregationBuilder size(int size) {
if (size < -1) {
if (size <= 0) {
throw new IllegalArgumentException(
"[size] must be greater than or equal to 0. Found [" + shardSize + "] in [" + name + "]");
"[size] must be greater than 0. Found [" + size + "] in [" + name + "]");
}
this.requiredSize = size;
return this;
@ -97,9 +98,9 @@ public class GeoGridAggregationBuilder extends ValuesSourceAggregationBuilder<Va
}
public GeoGridAggregationBuilder shardSize(int shardSize) {
if (shardSize < -1) {
if (shardSize < -1 || shardSize == 0) {
throw new IllegalArgumentException(
"[shardSize] must be greater than or equal to 0. Found [" + shardSize + "] in [" + name + "]");
"[shardSize] must be greater than 0. Found [" + shardSize + "] in [" + name + "]");
}
this.shardSize = shardSize;
return this;
@ -114,20 +115,20 @@ public class GeoGridAggregationBuilder extends ValuesSourceAggregationBuilder<Va
ValuesSourceConfig<ValuesSource.GeoPoint> config, AggregatorFactory<?> parent, Builder subFactoriesBuilder)
throws IOException {
int shardSize = this.shardSize;
if (shardSize == 0) {
shardSize = Integer.MAX_VALUE;
}
int requiredSize = this.requiredSize;
if (requiredSize == 0) {
requiredSize = Integer.MAX_VALUE;
}
if (shardSize < 0) {
// Use default heuristic to avoid any wrong-ranking caused by distributed counting
shardSize = BucketUtils.suggestShardSideQueueSize(requiredSize, context.searchContext().numberOfShards());
}
if (requiredSize <= 0 || shardSize <= 0) {
throw new ElasticsearchException(
"parameters [required_size] and [shard_size] must be >0 in geohash_grid aggregation [" + name + "].");
}
if (shardSize < requiredSize) {
shardSize = requiredSize;
}
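Net effect of the builder changes above: `size` must be strictly positive, while `shard_size` may be either strictly positive or `-1` to request the heuristic. A condensed sketch of those rules (standalone method, not the actual builder code):

[source,java]
--------------------------------------------------
// Hedged summary of the geohash_grid validation introduced above.
static void validateGridSizes(String name, int size, int shardSize) {
    if (size <= 0) {
        throw new IllegalArgumentException(
                "[size] must be greater than 0. Found [" + size + "] in [" + name + "]");
    }
    // -1 keeps the "derive shard_size from size and shard count" behaviour.
    if (shardSize < -1 || shardSize == 0) {
        throw new IllegalArgumentException(
                "[shardSize] must be greater than 0. Found [" + shardSize + "] in [" + name + "]");
    }
}
--------------------------------------------------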

View File

@ -83,13 +83,6 @@ public abstract class TermsAggregator extends BucketsAggregator {
public void ensureValidity() {
if (shardSize == 0) {
setShardSize(Integer.MAX_VALUE);
}
if (requiredSize == 0) {
setRequiredSize(Integer.MAX_VALUE);
}
// shard_size cannot be smaller than size as we need to at least fetch <size> entries from every shard in order to return <size>
if (shardSize < requiredSize) {
setShardSize(requiredSize);
@ -100,8 +93,12 @@ public abstract class TermsAggregator extends BucketsAggregator {
setShardMinDocCount(minDocCount);
}
if (requiredSize < 0 || minDocCount < 0) {
throw new ElasticsearchException("parameters [requiredSize] and [minDocCount] must be >=0 in terms aggregation.");
if (requiredSize <= 0 || shardSize <= 0) {
throw new ElasticsearchException("parameters [required_size] and [shard_size] must be >0 in terms aggregation.");
}
if (minDocCount < 0 || shardMinDocCount < 0) {
throw new ElasticsearchException("parameter [min_doc_count] and [shardMinDocCount] must be >=0 in terms aggregation.");
}
}
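The ordering in `ensureValidity` matters: shard-level values are first clamped against their request-level counterparts, and only then range-checked. A simplified sketch of that flow (the `BucketThresholds` holder is hypothetical, and the clamp conditions are reconstructed from context):

[source,java]
--------------------------------------------------
import org.elasticsearch.ElasticsearchException;

// Hypothetical mutable holder for the four thresholds.
final class BucketThresholds {
    int requiredSize, shardSize;
    long minDocCount, shardMinDocCount;

    void ensureValidity() {
        // shard_size must be at least size, or shards could not return enough buckets
        if (shardSize < requiredSize) {
            shardSize = requiredSize;
        }
        // the shard-level doc-count filter must not be stricter than the final one
        if (shardMinDocCount > minDocCount) {
            shardMinDocCount = minDocCount;
        }
        if (requiredSize <= 0 || shardSize <= 0) {
            throw new ElasticsearchException(
                    "parameters [required_size] and [shard_size] must be >0 in terms aggregation.");
        }
        if (minDocCount < 0 || shardMinDocCount < 0) {
            throw new ElasticsearchException(
                    "parameters [min_doc_count] and [shardMinDocCount] must be >=0 in terms aggregation.");
        }
    }
}
--------------------------------------------------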

View File

@ -134,10 +134,9 @@ public class ChildrenIT extends ESIntegTestCase {
SearchResponse searchResponse = client().prepareSearch("test")
.setQuery(matchQuery("randomized", true))
.addAggregation(
terms("category").field("category").size(0).subAggregation(
children("to_comment", "comment")
terms("category").field("category").size(10000).subAggregation(children("to_comment", "comment")
.subAggregation(
terms("commenters").field("commenter").size(0).subAggregation(
terms("commenters").field("commenter").size(10000).subAggregation(
topHits("top_comments")
))
)
@ -176,7 +175,7 @@ children("to_comment", "comment")
SearchResponse searchResponse = client().prepareSearch("test")
.setQuery(matchQuery("randomized", false))
.addAggregation(
terms("category").field("category").size(0).subAggregation(
terms("category").field("category").size(10000).subAggregation(
children("to_comment", "comment").subAggregation(topHits("top_comments").sort("_uid", SortOrder.ASC))
)
).get();

View File

@ -21,6 +21,7 @@ package org.elasticsearch.search.aggregations.bucket;
import com.carrotsearch.hppc.ObjectIntHashMap;
import com.carrotsearch.hppc.ObjectIntMap;
import com.carrotsearch.hppc.cursors.ObjectIntCursor;
import org.elasticsearch.Version;
import org.elasticsearch.action.index.IndexRequestBuilder;
import org.elasticsearch.action.search.SearchResponse;
@ -52,8 +53,8 @@ import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.elasticsearch.search.aggregations.AggregationBuilders.geohashGrid;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchResponse;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
@ESIntegTestCase.SuiteScopeTestCase
public class GeoHashGridIT extends ESIntegTestCase {
@ -305,24 +306,24 @@ public class GeoHashGridIT extends ESIntegTestCase {
}
}
// making sure this doesn't run into an OOME
public void testSizeIsZero() {
for (int precision = 1; precision <= PRECISION; precision++) {
final int size = randomBoolean() ? 0 : randomIntBetween(1, Integer.MAX_VALUE);
final int shardSize = randomBoolean() ? -1 : 0;
SearchResponse response = client().prepareSearch("idx")
.addAggregation(geohashGrid("geohashgrid")
.field("location")
.size(size)
.shardSize(shardSize)
.precision(precision)
)
.execute().actionGet();
final int size = 0;
final int shardSize = 10000;
IllegalArgumentException exception = expectThrows(IllegalArgumentException.class,
() -> client().prepareSearch("idx")
.addAggregation(geohashGrid("geohashgrid").field("location").size(size).shardSize(shardSize)).execute()
.actionGet());
assertThat(exception.getMessage(), containsString("[size] must be greater than 0. Found [0] in [geohashgrid]"));
}
assertSearchResponse(response);
GeoHashGrid geoGrid = response.getAggregations().get("geohashgrid");
assertThat(geoGrid.getBuckets().size(), greaterThanOrEqualTo(1));
}
public void testShardSizeIsZero() {
final int size = 100;
final int shardSize = 0;
IllegalArgumentException exception = expectThrows(IllegalArgumentException.class,
() -> client().prepareSearch("idx")
.addAggregation(geohashGrid("geohashgrid").field("location").size(size).shardSize(shardSize))
.execute().actionGet());
assertThat(exception.getMessage(), containsString("[shardSize] must be greater than 0. Found [0] in [geohashgrid]"));
}
}

View File

@ -33,33 +33,10 @@ public class GeoHashGridTests extends BaseAggregationTestCase<GeoGridAggregation
factory.precision(precision);
}
if (randomBoolean()) {
int size = randomInt(5);
switch (size) {
case 0:
break;
case 1:
case 2:
case 3:
case 4:
size = randomIntBetween(0, Integer.MAX_VALUE);
break;
}
factory.size(size);
factory.size(randomIntBetween(1, Integer.MAX_VALUE));
}
if (randomBoolean()) {
int shardSize = randomInt(5);
switch (shardSize) {
case 0:
break;
case 1:
case 2:
case 3:
case 4:
shardSize = randomIntBetween(0, Integer.MAX_VALUE);
break;
}
factory.shardSize(shardSize);
factory.shardSize(randomIntBetween(1, Integer.MAX_VALUE));
}
return factory;
}

View File

@ -353,7 +353,7 @@ public class ReverseNestedIT extends ESIntegTestCase {
.subAggregation(
terms("field2").field("nested1.nested2.field2").order(Terms.Order.term(true))
.collectMode(randomFrom(SubAggCollectionMode.values()))
.size(0)
.size(10000)
.subAggregation(
reverseNested("nested1_to_field1").path("nested1")
.subAggregation(

View File

@ -73,37 +73,11 @@ public class SignificantTermsTests extends BaseAggregationTestCase<SignificantTe
factory.missing("MISSING");
}
if (randomBoolean()) {
int size = randomInt(4);
switch (size) {
case 0:
break;
case 1:
case 2:
case 3:
case 4:
size = randomInt();
break;
default:
fail();
}
factory.bucketCountThresholds().setRequiredSize(size);
factory.bucketCountThresholds().setRequiredSize(randomIntBetween(1, Integer.MAX_VALUE));
}
if (randomBoolean()) {
int shardSize = randomInt(4);
switch (shardSize) {
case 0:
break;
case 1:
case 2:
case 3:
case 4:
shardSize = randomInt();
break;
default:
fail();
}
factory.bucketCountThresholds().setShardSize(shardSize);
factory.bucketCountThresholds().setShardSize(randomIntBetween(1, Integer.MAX_VALUE));
}
if (randomBoolean()) {
int minDocCount = randomInt(4);

View File

@ -202,8 +202,7 @@ public class TermsDocCountErrorIT extends ESIntegTestCase {
.executionHint(randomExecutionHint())
.field(STRING_FIELD_NAME)
.showTermDocCountError(true)
.size(0)
.shardSize(0)
.size(10000).shardSize(10000)
.collectMode(randomFrom(SubAggCollectionMode.values())))
.execute().actionGet();
@ -232,8 +231,7 @@ public class TermsDocCountErrorIT extends ESIntegTestCase {
.executionHint(randomExecutionHint())
.field(STRING_FIELD_NAME)
.showTermDocCountError(true)
.size(0)
.shardSize(0)
.size(10000).shardSize(10000)
.collectMode(randomFrom(SubAggCollectionMode.values())))
.execute().actionGet();
@ -281,8 +279,7 @@ public class TermsDocCountErrorIT extends ESIntegTestCase {
.executionHint(randomExecutionHint())
.field(STRING_FIELD_NAME)
.showTermDocCountError(true)
.size(0)
.shardSize(0)
.size(10000).shardSize(10000)
.order(Order.count(true))
.collectMode(randomFrom(SubAggCollectionMode.values())))
.execute().actionGet();
@ -313,8 +310,7 @@ public class TermsDocCountErrorIT extends ESIntegTestCase {
.executionHint(randomExecutionHint())
.field(STRING_FIELD_NAME)
.showTermDocCountError(true)
.size(0)
.shardSize(0)
.size(10000).shardSize(10000)
.order(Order.term(true))
.collectMode(randomFrom(SubAggCollectionMode.values())))
.execute().actionGet();
@ -345,8 +341,7 @@ public class TermsDocCountErrorIT extends ESIntegTestCase {
.executionHint(randomExecutionHint())
.field(STRING_FIELD_NAME)
.showTermDocCountError(true)
.size(0)
.shardSize(0)
.size(10000).shardSize(10000)
.order(Order.term(false))
.collectMode(randomFrom(SubAggCollectionMode.values())))
.execute().actionGet();
@ -377,8 +372,7 @@ public class TermsDocCountErrorIT extends ESIntegTestCase {
.executionHint(randomExecutionHint())
.field(STRING_FIELD_NAME)
.showTermDocCountError(true)
.size(0)
.shardSize(0)
.size(10000).shardSize(10000)
.order(Order.aggregation("sortAgg", true))
.collectMode(randomFrom(SubAggCollectionMode.values()))
.subAggregation(sum("sortAgg").field(LONG_FIELD_NAME)))
@ -411,8 +405,7 @@ public class TermsDocCountErrorIT extends ESIntegTestCase {
.executionHint(randomExecutionHint())
.field(STRING_FIELD_NAME)
.showTermDocCountError(true)
.size(0)
.shardSize(0)
.size(10000).shardSize(10000)
.order(Order.aggregation("sortAgg", false))
.collectMode(randomFrom(SubAggCollectionMode.values()))
.subAggregation(sum("sortAgg").field(LONG_FIELD_NAME)))
@ -445,8 +438,7 @@ public class TermsDocCountErrorIT extends ESIntegTestCase {
.executionHint(randomExecutionHint())
.field(LONG_FIELD_NAME)
.showTermDocCountError(true)
.size(0)
.shardSize(0)
.size(10000).shardSize(10000)
.collectMode(randomFrom(SubAggCollectionMode.values())))
.execute().actionGet();
@ -475,8 +467,7 @@ public class TermsDocCountErrorIT extends ESIntegTestCase {
.executionHint(randomExecutionHint())
.field(LONG_FIELD_NAME)
.showTermDocCountError(true)
.size(0)
.shardSize(0)
.size(10000).shardSize(10000)
.collectMode(randomFrom(SubAggCollectionMode.values())))
.execute().actionGet();
@ -524,8 +515,7 @@ public class TermsDocCountErrorIT extends ESIntegTestCase {
.executionHint(randomExecutionHint())
.field(LONG_FIELD_NAME)
.showTermDocCountError(true)
.size(0)
.shardSize(0)
.size(10000).shardSize(10000)
.order(Order.count(true))
.collectMode(randomFrom(SubAggCollectionMode.values())))
.execute().actionGet();
@ -556,8 +546,7 @@ public class TermsDocCountErrorIT extends ESIntegTestCase {
.executionHint(randomExecutionHint())
.field(LONG_FIELD_NAME)
.showTermDocCountError(true)
.size(0)
.shardSize(0)
.size(10000).shardSize(10000)
.order(Order.term(true))
.collectMode(randomFrom(SubAggCollectionMode.values())))
.execute().actionGet();
@ -588,8 +577,7 @@ public class TermsDocCountErrorIT extends ESIntegTestCase {
.executionHint(randomExecutionHint())
.field(LONG_FIELD_NAME)
.showTermDocCountError(true)
.size(0)
.shardSize(0)
.size(10000).shardSize(10000)
.order(Order.term(false))
.collectMode(randomFrom(SubAggCollectionMode.values())))
.execute().actionGet();
@ -620,8 +608,7 @@ public class TermsDocCountErrorIT extends ESIntegTestCase {
.executionHint(randomExecutionHint())
.field(LONG_FIELD_NAME)
.showTermDocCountError(true)
.size(0)
.shardSize(0)
.size(10000).shardSize(10000)
.order(Order.aggregation("sortAgg", true))
.collectMode(randomFrom(SubAggCollectionMode.values()))
.subAggregation(sum("sortAgg").field(LONG_FIELD_NAME)))
@ -654,8 +641,7 @@ public class TermsDocCountErrorIT extends ESIntegTestCase {
.executionHint(randomExecutionHint())
.field(LONG_FIELD_NAME)
.showTermDocCountError(true)
.size(0)
.shardSize(0)
.size(10000).shardSize(10000)
.order(Order.aggregation("sortAgg", false))
.collectMode(randomFrom(SubAggCollectionMode.values()))
.subAggregation(sum("sortAgg").field(DOUBLE_FIELD_NAME)))
@ -688,8 +674,7 @@ public class TermsDocCountErrorIT extends ESIntegTestCase {
.executionHint(randomExecutionHint())
.field(DOUBLE_FIELD_NAME)
.showTermDocCountError(true)
.size(0)
.shardSize(0)
.size(10000).shardSize(10000)
.collectMode(randomFrom(SubAggCollectionMode.values())))
.execute().actionGet();
@ -718,8 +703,7 @@ public class TermsDocCountErrorIT extends ESIntegTestCase {
.executionHint(randomExecutionHint())
.field(DOUBLE_FIELD_NAME)
.showTermDocCountError(true)
.size(0)
.shardSize(0)
.size(10000).shardSize(10000)
.collectMode(randomFrom(SubAggCollectionMode.values())))
.execute().actionGet();
@ -767,8 +751,7 @@ public class TermsDocCountErrorIT extends ESIntegTestCase {
.executionHint(randomExecutionHint())
.field(DOUBLE_FIELD_NAME)
.showTermDocCountError(true)
.size(0)
.shardSize(0)
.size(10000).shardSize(10000)
.order(Order.count(true))
.collectMode(randomFrom(SubAggCollectionMode.values())))
.execute().actionGet();
@ -799,8 +782,7 @@ public class TermsDocCountErrorIT extends ESIntegTestCase {
.executionHint(randomExecutionHint())
.field(DOUBLE_FIELD_NAME)
.showTermDocCountError(true)
.size(0)
.shardSize(0)
.size(10000).shardSize(10000)
.order(Order.term(true))
.collectMode(randomFrom(SubAggCollectionMode.values())))
.execute().actionGet();
@ -831,8 +813,7 @@ public class TermsDocCountErrorIT extends ESIntegTestCase {
.executionHint(randomExecutionHint())
.field(DOUBLE_FIELD_NAME)
.showTermDocCountError(true)
.size(0)
.shardSize(0)
.size(10000).shardSize(10000)
.order(Order.term(false))
.collectMode(randomFrom(SubAggCollectionMode.values())))
.execute().actionGet();
@ -863,8 +844,7 @@ public class TermsDocCountErrorIT extends ESIntegTestCase {
.executionHint(randomExecutionHint())
.field(DOUBLE_FIELD_NAME)
.showTermDocCountError(true)
.size(0)
.shardSize(0)
.size(10000).shardSize(10000)
.order(Order.aggregation("sortAgg", true))
.collectMode(randomFrom(SubAggCollectionMode.values()))
.subAggregation(sum("sortAgg").field(LONG_FIELD_NAME)))
@ -897,8 +877,7 @@ public class TermsDocCountErrorIT extends ESIntegTestCase {
.executionHint(randomExecutionHint())
.field(DOUBLE_FIELD_NAME)
.showTermDocCountError(true)
.size(0)
.shardSize(0)
.size(10000).shardSize(10000)
.order(Order.aggregation("sortAgg", false))
.collectMode(randomFrom(SubAggCollectionMode.values()))
.subAggregation(sum("sortAgg").field(LONG_FIELD_NAME)))

View File

@ -69,37 +69,10 @@ public class TermsTests extends BaseAggregationTestCase<TermsAggregationBuilder>
factory.missing("MISSING");
}
if (randomBoolean()) {
int size = randomInt(4);
switch (size) {
case 0:
break;
case 1:
case 2:
case 3:
case 4:
size = randomInt();
break;
default:
fail();
}
factory.bucketCountThresholds().setRequiredSize(size);
factory.bucketCountThresholds().setRequiredSize(randomIntBetween(1, Integer.MAX_VALUE));
}
if (randomBoolean()) {
int shardSize = randomInt(4);
switch (shardSize) {
case 0:
break;
case 1:
case 2:
case 3:
case 4:
shardSize = randomInt();
break;
default:
fail();
}
factory.bucketCountThresholds().setShardSize(shardSize);
factory.bucketCountThresholds().setShardSize(randomIntBetween(1, Integer.MAX_VALUE));
}
if (randomBoolean()) {
int minDocCount = randomInt(4);

View File

@ -117,15 +117,9 @@ precision:: Optional. The string length of the geohashes used to define
size:: Optional. The maximum number of geohash buckets to return
(defaults to 10,000). When results are trimmed, buckets are
prioritised based on the volumes of documents they contain.
A value of `0` will return all buckets that
contain a hit, use with caution as this could use a lot of CPU
and network bandwidth if there are many buckets.
shard_size:: Optional. To allow for more accurate counting of the top cells
returned in the final result the aggregation defaults to
returning `max(10,(size x number-of-shards))` buckets from each
shard. If this heuristic is undesirable, the number considered
from each shard can be overridden using this parameter.
A value of `0` makes the shard size unlimited.
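A hedged sketch of that default heuristic, going by the `max(10,(size x number-of-shards))` formula the text above quotes (the real `BucketUtils.suggestShardSideQueueSize` may differ in detail):

[source,java]
--------------------------------------------------
static int suggestShardSideQueueSize(int size, int numberOfShards) {
    if (numberOfShards == 1) {
        return size; // a single shard sees every document, so no over-request needed
    }
    long estimate = Math.max(10L, (long) size * numberOfShards);
    return (int) Math.min(Integer.MAX_VALUE, estimate); // guard against overflow
}
--------------------------------------------------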

View File

@ -224,12 +224,12 @@ are presented unstemmed, highlighted, with the right case, in the right order an
==== Custom background sets
Ordinarily, the foreground set of documents is "diffed" against a background set of all the documents in your index.
However, sometimes it may prove useful to use a narrower background set as the basis for comparisons.
For example, a query on documents relating to "Madrid" in an index with content from all over the world might reveal that "Spanish"
was a significant term. This may be true but if you want some more focused terms you could use a `background_filter`
on the term 'spain' to establish a narrower set of documents as context. With this as a background "Spanish" would now
be seen as commonplace and therefore not as significant as words like "capital" that relate more strongly with Madrid.
Note that using a background filter will slow things down - each term's background frequency must now be derived on-the-fly from filtering posting lists rather than reading the index's pre-computed count for a term.
==== Limitations
@ -274,7 +274,7 @@ The scores are derived from the doc frequencies in _foreground_ and _background_
===== mutual information
Mutual information as described in "Information Retrieval", Manning et al., Chapter 13.5.1 can be used as a significance score by adding the parameter
[source,js]
--------------------------------------------------
@ -283,9 +283,9 @@ Mutual information as described in "Information Retrieval", Manning et al., Chap
}
--------------------------------------------------
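For reference, the textbook formulation this refers to scores a term by the mutual information of two indicator variables, term occurrence and subset membership, over a 2x2 contingency table (a sketch of the formula from that chapter; the implementation may differ in smoothing details):

[source,latex]
--------------------------------------------------
I(U;C) = \sum_{e_t \in \{0,1\}} \sum_{e_c \in \{0,1\}}
         \frac{N_{e_t e_c}}{N} \log_2 \frac{N \, N_{e_t e_c}}{N_{e_t \cdot} \, N_{\cdot e_c}}
--------------------------------------------------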
Mutual information does not differentiate between terms that are descriptive for the subset or for documents outside the subset. The significant terms therefore can contain terms that appear more or less frequently in the subset than outside of it. To filter out the terms that appear less often in the subset than in documents outside the subset, `include_negatives` can be set to `false`.
By default, the assumption is that the documents in the bucket are also contained in the background. If instead you defined a custom background filter that represents a different set of documents that you want to compare to, set
[source,js]
--------------------------------------------------
@ -296,7 +296,7 @@ Per default, the assumption is that the documents in the bucket are also contain
===== Chi square
Chi square as described in "Information Retrieval", Manning et al., Chapter 13.5.2 can be used as a significance score by adding the parameter
[source,js]
--------------------------------------------------
@ -309,7 +309,7 @@ Chi square behaves like mutual information and can be configured with the same p
===== google normalized distance
Google normalized distance as described in "The Google Similarity Distance", Cilibrasi and Vitanyi, 2007 (http://arxiv.org/pdf/cs/0412098v3.pdf) can be used as a significance score by adding the parameter
[source,js]
--------------------------------------------------
@ -317,7 +317,7 @@ Google normalized distance as described in "The Google Similarity Distance", Ci
}
--------------------------------------------------
`gnd` also accepts the `background_is_superset` parameter.
===== Percentage
@ -328,7 +328,7 @@ The benefit of this heuristic is that the scoring logic is simple to explain to
It would be hard for a seasoned boxer to win a championship if the prize was awarded purely on the basis of percentage of fights won - by these rules a newcomer with only one fight under his belt would be impossible to beat.
Multiple observations are typically required to reinforce a view so it is recommended in these cases to set both `min_doc_count` and `shard_min_doc_count` to a higher value such as 10 in order to filter out the low-frequency terms that otherwise take precedence.
[source,js]
--------------------------------------------------
@ -348,7 +348,7 @@ If none of the above measures suits your usecase than another option is to imple
===== scripted
Customized scores can be implemented via a script:
[source,js]
--------------------------------------------------
@ -357,7 +357,7 @@ Customized scores can be implemented via a script:
}
--------------------------------------------------
Scripts can be inline (as in the example above), indexed or stored on disk. For details on the options, see <<modules-scripting, script documentation>>.
Available parameters in the script are
@ -374,9 +374,7 @@ default, the node coordinating the search process will request each shard to pro
and once all shards respond, it will reduce the results to the final list that will then be returned to the client.
If the number of unique terms is greater than `size`, the returned list can be slightly off and not accurate
(it could be that the term counts are slightly off and it could even be that a term that should have been in the top
size buckets was not returned).
If set to `0`, the `size` will be set to `Integer.MAX_VALUE`.
To ensure better accuracy a multiple of the final `size` is used as the number of terms to request from each shard
using a heuristic based on the number of shards. To take manual control of this setting the `shard_size` parameter
@ -386,12 +384,9 @@ Low-frequency terms can turn out to be the most interesting ones once all result
significant_terms aggregation can produce higher-quality results when the `shard_size` parameter is set to
values significantly higher than the `size` setting. This ensures that a bigger volume of promising candidate terms are given
a consolidated review by the reducing node before the final selection. Obviously large candidate term lists
will cause extra network traffic and RAM usage so this is a quality/cost trade-off that needs to be balanced. If `shard_size` is set to -1 (the default) then `shard_size` will be automatically estimated based on the number of shards and the `size` parameter.
If set to `0`, the `shard_size` will be set to `Integer.MAX_VALUE`.
NOTE: `shard_size` cannot be smaller than `size` (as it doesn't make much sense). When it is, elasticsearch will
override it and reset it to be equal to `size`.
@ -439,7 +434,7 @@ WARNING: Setting `min_doc_count` to `1` is generally not advised as it tends to
The default source of statistical information for background term frequencies is the entire index and this
scope can be narrowed through the use of a `background_filter` to focus in on significant terms within a narrower
context:
[source,js]
--------------------------------------------------
@ -449,7 +444,7 @@ context:
},
"aggs" : {
"tags" : {
"significant_terms" : {
"significant_terms" : {
"field" : "tag",
"background_filter": {
"term" : { "text" : "spain"}
@ -460,9 +455,9 @@ context:
}
--------------------------------------------------
The above filter would help focus in on terms that were peculiar to the city of Madrid rather than revealing
terms like "Spanish" that are unusual in the full index's worldwide context but commonplace in the subset of documents containing the
word "Spain".
WARNING: Use of background filters will slow the query as each term's postings must be filtered to determine a frequency
@ -482,7 +477,7 @@ There are different mechanisms by which terms aggregations can be executed:
- by using field values directly in order to aggregate data per-bucket (`map`)
- by using ordinals of the field and preemptively allocating one bucket per ordinal value (`global_ordinals`)
- by using ordinals of the field and dynamically allocating one bucket per ordinal value (`global_ordinals_hash`)
Elasticsearch tries to have sensible defaults so this is something that generally doesn't need to be configured.
`map` should only be considered when very few documents match a query. Otherwise the ordinals-based execution modes
@ -514,4 +509,3 @@ in inner aggregations.
<1> the possible values are `map`, `global_ordinals` and `global_ordinals_hash`
Please note that Elasticsearch will ignore this execution hint if it is not applicable.
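For instance, a hedged builder sketch pinning the hint (string values as listed above; useful only when few documents match):

[source,java]
--------------------------------------------------
import org.elasticsearch.search.aggregations.AggregationBuilders;
import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder;

// "map" collects values per document and only pays off on small result sets.
TermsAggregationBuilder tags = AggregationBuilders.terms("tags")
        .field("tags")
        .executionHint("map");
--------------------------------------------------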

View File

@ -56,7 +56,7 @@ default, the node coordinating the search process will request each shard to pro
and once all shards respond, it will reduce the results to the final list that will then be returned to the client.
This means that if the number of unique terms is greater than `size`, the returned list is slightly off and not accurate
(it could be that the term counts are slightly off and it could even be that a term that should have been in the top
size buckets was not returned). If set to `0`, the `size` will be set to `Integer.MAX_VALUE`.
[[search-aggregations-bucket-terms-aggregation-approximate-counts]]
==== Document counts are approximate
@ -149,14 +149,12 @@ The `shard_size` parameter can be used to minimize the extra work that comes wi
it will determine how many terms the coordinating node will request from each shard. Once all the shards responded, the
coordinating node will then reduce them to a final result which will be based on the `size` parameter - this way,
one can increase the accuracy of the returned terms and avoid the overhead of streaming a big list of buckets back to
the client. If set to `0`, the `shard_size` will be set to `Integer.MAX_VALUE`.
NOTE: `shard_size` cannot be smaller than `size` (as it doesn't make much sense). When it is, elasticsearch will
override it and reset it to be equal to `size`.
It is possible to not limit the number of terms that are returned by setting `size` to `0`. Don't use this
on high-cardinality fields as this will kill both your CPU since terms need to be returned sorted, and your network.
The default `shard_size` is a multiple of the `size` parameter, which is dependent on the number of shards.
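To make the trade-off explicit, both bounds can be set on the builder (hedged example; the numbers are illustrative, not recommendations):

[source,java]
--------------------------------------------------
// Request 5x the final size from every shard to cut ranking error,
// then reduce to the top 100 terms on the coordinating node.
AggregationBuilders.terms("actors")
        .field("actors")
        .size(100)
        .shardSize(500);
--------------------------------------------------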
@ -443,7 +441,7 @@ Generating the terms using a script:
"aggs" : {
"genders" : {
"terms" : {
"script" : {
"script" : {
"inline": "doc['gender'].value"
"lang": "painless"
}
@ -485,9 +483,9 @@ TIP: for indexed scripts replace the `file` parameter with an `id` parameter.
"genders" : {
"terms" : {
"field" : "gender",
"script" : {
"script" : {
"inline" : "'Gender: ' +_value"
"lang" : "painless"
"lang" : "painless"
}
}
}
@ -710,4 +708,4 @@ had a value.
}
--------------------------------------------------
<1> Documents without a value in the `tags` field will fall into the same bucket as documents that have the value `N/A`.

View File

@ -20,3 +20,9 @@ Now that Elasticsearch supports `ipv6`, `ip` addresses are encoded in the index
using a binary representation rather than a numeric representation. As a
consequence, the output of `ip_range` aggregations does not give numeric values
for `from` and `to` anymore.
==== `size: 0` on Terms, Significant Terms and Geohash Grid Aggregations
`size: 0` is no longer valid for the `terms`, `significant_terms` and `geohash_grid`
aggregations. Instead, a size greater than zero must be specified explicitly.
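A hedged migration sketch for Java request builders (the `10000` bound is illustrative; pick one that covers the field's expected cardinality):

[source,java]
--------------------------------------------------
// Before (2.x): .size(0) meant "return every term". It now fails with:
// "parameters [required_size] and [shard_size] must be >0 in terms aggregation."
AggregationBuilders.terms("tags")
        .field("tags")
        .size(10000); // explicit upper bound instead of 0
--------------------------------------------------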

View File

@ -63,6 +63,7 @@ import static org.elasticsearch.search.aggregations.AggregationBuilders.sum;
import static org.elasticsearch.search.aggregations.AggregationBuilders.terms;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchResponse;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.core.IsNull.notNullValue;
@ -235,20 +236,13 @@ public class DoubleTermsTests extends AbstractTermsTestCase {
// the main purpose of this test is to make sure we're not allocating 2GB of memory per shard
public void testSizeIsZero() {
SearchResponse response = client().prepareSearch("idx").setTypes("high_card_type")
.addAggregation(terms("terms")
.field(SINGLE_VALUED_FIELD_NAME)
.minDocCount(randomInt(1))
.size(0)
.collectMode(randomFrom(SubAggCollectionMode.values())))
.execute().actionGet();
assertSearchResponse(response);
Terms terms = response.getAggregations().get("terms");
assertThat(terms, notNullValue());
assertThat(terms.getName(), equalTo("terms"));
assertThat(terms.getBuckets().size(), equalTo(100));
ElasticsearchException exception = expectThrows(ElasticsearchException.class,
() -> client()
.prepareSearch("idx").setTypes("high_card_type").addAggregation(terms("terms").field(SINGLE_VALUED_FIELD_NAME)
.minDocCount(randomInt(1)).size(0).collectMode(randomFrom(SubAggCollectionMode.values())))
.execute().actionGet());
assertThat(exception.getDetailedMessage(),
containsString("parameters [required_size] and [shard_size] must be >0 in terms aggregation."));
}
public void testSingleValueField() throws Exception {

View File

@ -61,6 +61,7 @@ import static org.elasticsearch.search.aggregations.AggregationBuilders.sum;
import static org.elasticsearch.search.aggregations.AggregationBuilders.terms;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchResponse;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.core.IsNull.notNullValue;
@ -237,20 +238,16 @@ public class LongTermsTests extends AbstractTermsTestCase {
// the main purpose of this test is to make sure we're not allocating 2GB of memory per shard
public void testSizeIsZero() {
SearchResponse response = client().prepareSearch("idx").setTypes("high_card_type")
ElasticsearchException exception = expectThrows(ElasticsearchException.class,
() -> client().prepareSearch("idx").setTypes("high_card_type")
.addAggregation(terms("terms")
.field(SINGLE_VALUED_FIELD_NAME)
.collectMode(randomFrom(SubAggCollectionMode.values()))
.minDocCount(randomInt(1))
.size(0))
.execute().actionGet();
assertSearchResponse(response);
Terms terms = response.getAggregations().get("terms");
assertThat(terms, notNullValue());
assertThat(terms.getName(), equalTo("terms"));
assertThat(terms.getBuckets().size(), equalTo(100));
.execute().actionGet());
assertThat(exception.getDetailedMessage(),
containsString("parameters [required_size] and [shard_size] must be >0 in terms aggregation."));
}
public void testSingleValueField() throws Exception {

View File

@ -70,6 +70,7 @@ import static org.elasticsearch.search.aggregations.AggregationBuilders.sum;
import static org.elasticsearch.search.aggregations.AggregationBuilders.terms;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchResponse;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.core.IsNull.notNullValue;
@ -202,20 +203,15 @@ public class StringTermsTests extends AbstractTermsTestCase {
// the main purpose of this test is to make sure we're not allocating 2GB of memory per shard
public void testSizeIsZero() {
final int minDocCount = randomInt(1);
SearchResponse response = client()
ElasticsearchException exception = expectThrows(ElasticsearchException.class, () -> client()
.prepareSearch("idx")
.setTypes("high_card_type")
.addAggregation(
terms("terms").executionHint(randomExecutionHint()).field(SINGLE_VALUED_FIELD_NAME)
.collectMode(randomFrom(SubAggCollectionMode.values())).minDocCount(minDocCount).size(0)).execute()
.actionGet();
assertSearchResponse(response);
Terms terms = response.getAggregations().get("terms");
assertThat(terms, notNullValue());
assertThat(terms.getName(), equalTo("terms"));
assertThat(terms.getBuckets().size(), equalTo(minDocCount == 0 ? 105 : 100)); // 105 because of the other type
.actionGet());
assertThat(exception.getDetailedMessage(),
containsString("parameters [required_size] and [shard_size] must be >0 in terms aggregation."));
}
public void testSingleValueField() throws Exception {

View File

@ -48,7 +48,7 @@ public abstract class AbstractTermsTestCase extends ESIntegTestCase {
.addAggregation(terms("terms")
.executionHint(randomExecutionHint())
.field(fieldName)
.size(0)
.size(10000)
.collectMode(randomFrom(SubAggCollectionMode.values())))
.get();
assertSearchResponse(allTerms);