Fix bug with terms' min_doc_count (#62130) (#62177)

The `global_ordinals` implementation of `terms` had a bug when
`min_doc_count: 0` that'd cause sub-aggregations to have array index out
of bounds exceptions. Ooops. My fault. This fixes the bug by assigning
ordinals to those buckets.

Closes #62084
This commit is contained in:
Nik Everett 2020-09-09 13:04:51 -04:00 committed by GitHub
parent 885051fc14
commit 1104d65465
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 142 additions and 2 deletions

View File

@ -1001,3 +1001,130 @@ setup:
- match: { profile.shards.0.aggregations.0.breakdown.collect_count: 4 }
- match: { profile.shards.0.aggregations.0.debug.result_strategy: long_terms }
- match: { profile.shards.0.aggregations.0.debug.total_buckets: 2 }
---
"min_doc_count":
- skip:
version: " - 7.99.99"
reason: broken in 7.9.1, not yet backported
- do:
bulk:
index: test_1
refresh: true
body: |
{ "index": {} }
{ "str": "sheep", "number": 1 }
{ "index": {} }
{ "str": "sheep", "number": 3 }
{ "index": {} }
{ "str": "cow", "number": 1 }
{ "index": {} }
{ "str": "pig", "number": 1 }
- do:
search:
index: test_1
body:
size: 0
query:
simple_query_string:
fields: [str]
query: sheep cow
minimum_should_match: 1
aggs:
str_terms:
terms:
field: str
min_doc_count: 2
aggs:
max_number:
max:
field: number
- length: { aggregations.str_terms.buckets: 1 }
- match: { aggregations.str_terms.buckets.0.key: sheep }
- match: { aggregations.str_terms.buckets.0.doc_count: 2 }
- match: { aggregations.str_terms.buckets.0.max_number.value: 3 }
- do:
search:
index: test_1
body:
size: 0
query:
simple_query_string:
fields: [str]
query: sheep cow
minimum_should_match: 1
aggs:
str_terms:
terms:
field: str
min_doc_count: 1
aggs:
max_number:
max:
field: number
- length: { aggregations.str_terms.buckets: 2 }
- match: { aggregations.str_terms.buckets.0.key: sheep }
- match: { aggregations.str_terms.buckets.0.doc_count: 2 }
- match: { aggregations.str_terms.buckets.0.max_number.value: 3 }
- match: { aggregations.str_terms.buckets.1.key: cow }
- match: { aggregations.str_terms.buckets.1.doc_count: 1 }
- match: { aggregations.str_terms.buckets.1.max_number.value: 1 }
- do:
search:
index: test_1
body:
size: 0
query:
simple_query_string:
fields: [str]
query: sheep cow
minimum_should_match: 1
aggs:
str_terms:
terms:
field: str
aggs:
max_number:
max:
field: number
- length: { aggregations.str_terms.buckets: 2 }
- match: { aggregations.str_terms.buckets.0.key: sheep }
- match: { aggregations.str_terms.buckets.0.doc_count: 2 }
- match: { aggregations.str_terms.buckets.0.max_number.value: 3 }
- match: { aggregations.str_terms.buckets.1.key: cow }
- match: { aggregations.str_terms.buckets.1.doc_count: 1 }
- match: { aggregations.str_terms.buckets.1.max_number.value: 1 }
- do:
search:
index: test_1
body:
size: 0
query:
simple_query_string:
fields: [str]
query: sheep cow
minimum_should_match: 1
aggs:
str_terms:
terms:
field: str
min_doc_count: 0
aggs:
max_number:
max:
field: number
- length: { aggregations.str_terms.buckets: 3 }
- match: { aggregations.str_terms.buckets.0.key: sheep }
- match: { aggregations.str_terms.buckets.0.doc_count: 2 }
- match: { aggregations.str_terms.buckets.0.max_number.value: 3 }
- match: { aggregations.str_terms.buckets.1.key: cow }
- match: { aggregations.str_terms.buckets.1.doc_count: 1 }
- match: { aggregations.str_terms.buckets.1.max_number.value: 1.0 }
- match: { aggregations.str_terms.buckets.2.key: pig }
- match: { aggregations.str_terms.buckets.2.doc_count: 0 }
- match: { aggregations.str_terms.buckets.2.max_number.value: null }

View File

@ -513,8 +513,21 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
if (false == acceptedGlobalOrdinals.test(globalOrd)) {
continue;
}
long bucketOrd = bucketOrds.find(owningBucketOrd, globalOrd);
long docCount = bucketOrd < 0 ? 0 : bucketDocCount(bucketOrd);
/*
* Use `add` instead of `find` here to assign an ordinal
* even if the global ord wasn't found so we can build
* sub-aggregations without trouble even though we haven't
* hit any documents for them. This is wasteful, but
* settings minDocCount == 0 is wasteful in general.....
*/
long bucketOrd = bucketOrds.add(owningBucketOrd, globalOrd);
long docCount;
if (bucketOrd < 0) {
bucketOrd = -1 - bucketOrd;
docCount = bucketDocCount(bucketOrd);
} else {
docCount = 0;
}
consumer.accept(globalOrd, bucketOrd, docCount);
}
} else {