Fix an optimization in terms agg (backport #57438) (#57547)

When the `terms` agg runs against strings and uses global ordinals it
has an optimization when it collects segments that only ever have a
single value for the particular string. This is *very* common. But I
broke it in #57241. This fixes that optimization and adds `debug`
information that you can use to see how often we collect segments of
each type. And adds a test to make sure that I don't break the
optimization again.

We also had a specialiation for when there isn't a filter on the terms
to aggregate. I had removed that specialization in #57241 which resulted
in some slow down as well. This adds it back but in a more clear way.
And, hopefully, a way that is marginally faster when there *is* a
filter.

Closes #57407
This commit is contained in:
Nik Everett 2020-06-02 14:57:45 -04:00 committed by GitHub
parent e50f514092
commit 97c06816a4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 151 additions and 68 deletions

View File

@ -779,7 +779,7 @@ setup:
body: { "size" : 0, "aggs" : { "no_field_terms" : { "terms" : { "size": 1 } } } }
---
"profiler":
"global ords profiler":
- skip:
version: " - 7.8.99"
reason: debug information added in 7.9.0
@ -808,6 +808,7 @@ setup:
terms:
field: str
collect_mode: breadth_first
execution_hint: global_ordinals
aggs:
max_number:
max:
@ -823,9 +824,83 @@ setup:
- match: { profile.shards.0.aggregations.0.breakdown.collect_count: 4 }
- match: { profile.shards.0.aggregations.0.debug.deferred_aggregators: [ max_number ] }
- match: { profile.shards.0.aggregations.0.debug.collection_strategy: dense }
- match: { profile.shards.0.aggregations.0.debug.result_strategy: terms }
- gt: { profile.shards.0.aggregations.0.debug.segments_with_single_valued_ords: 0 }
- match: { profile.shards.0.aggregations.0.debug.segments_with_multi_valued_ords: 0 }
- match: { profile.shards.0.aggregations.0.debug.has_filter: false }
- match: { profile.shards.0.aggregations.0.children.0.type: MaxAggregator }
- match: { profile.shards.0.aggregations.0.children.0.description: max_number }
- do:
indices.create:
index: test_3
body:
settings:
number_of_shards: 1
number_of_replicas: 0
mappings:
properties:
str:
type: keyword
- do:
bulk:
index: test_3
refresh: true
body: |
{ "index": {} }
{ "str": ["pig", "sheep"], "number": 100 }
- do:
search:
index: test_3
body:
profile: true
size: 0
aggs:
str_terms:
terms:
field: str
collect_mode: breadth_first
execution_hint: global_ordinals
aggs:
max_number:
max:
field: number
- match: { aggregations.str_terms.buckets.0.key: pig }
- match: { aggregations.str_terms.buckets.0.max_number.value: 100 }
- match: { aggregations.str_terms.buckets.1.key: sheep }
- match: { aggregations.str_terms.buckets.1.max_number.value: 100 }
- match: { profile.shards.0.aggregations.0.type: GlobalOrdinalsStringTermsAggregator }
- match: { profile.shards.0.aggregations.0.description: str_terms }
- match: { profile.shards.0.aggregations.0.breakdown.collect_count: 1 }
- match: { profile.shards.0.aggregations.0.debug.deferred_aggregators: [ max_number ] }
- match: { profile.shards.0.aggregations.0.debug.collection_strategy: dense }
- match: { profile.shards.0.aggregations.0.debug.result_strategy: terms }
- match: { profile.shards.0.aggregations.0.debug.segments_with_single_valued_ords: 0 }
- gt: { profile.shards.0.aggregations.0.debug.segments_with_multi_valued_ords: 0 }
- match: { profile.shards.0.aggregations.0.debug.has_filter: false }
- match: { profile.shards.0.aggregations.0.children.0.type: MaxAggregator }
- match: { profile.shards.0.aggregations.0.children.0.description: max_number }
---
"numeric profiler":
- skip:
version: " - 7.8.99"
reason: debug information added in 7.9.0
- do:
bulk:
index: test_1
refresh: true
body: |
{ "index": {} }
{ "number": 1 }
{ "index": {} }
{ "number": 3 }
{ "index": {} }
{ "number": 1 }
{ "index": {} }
{ "number": 1 }
- do:
search:
index: test_1

View File

@ -33,7 +33,6 @@ import org.elasticsearch.common.lease.Releasables;
import org.elasticsearch.common.util.IntArray;
import org.elasticsearch.common.util.LongHash;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.fielddata.AbstractSortedSetDocValues;
import org.elasticsearch.search.DocValueFormat;
import org.elasticsearch.search.aggregations.Aggregator;
import org.elasticsearch.search.aggregations.AggregatorFactories;
@ -73,6 +72,8 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
private final long valueCount;
private final GlobalOrdLookupFunction lookupGlobalOrd;
protected final CollectionStrategy collectionStrategy;
protected int segmentsWithSingleValuedOrds = 0;
protected int segmentsWithMultiValuedOrds = 0;
public interface GlobalOrdLookupFunction {
BytesRef apply(long ord) throws IOException;
@ -102,7 +103,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
valuesSource.globalOrdinalsValues(context.searcher().getIndexReader().leaves().get(0)) : DocValues.emptySortedSet();
this.valueCount = values.getValueCount();
this.lookupGlobalOrd = values::lookupOrd;
this.acceptedGlobalOrdinals = includeExclude == null ? l -> true : includeExclude.acceptedGlobalOrdinals(values)::get;
this.acceptedGlobalOrdinals = includeExclude == null ? ALWAYS_TRUE : includeExclude.acceptedGlobalOrdinals(values)::get;
this.collectionStrategy = remapGlobalOrds ? new RemapGlobalOrds() : new DenseGlobalOrds();
}
@ -110,24 +111,60 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
return collectionStrategy.describe();
}
private SortedSetDocValues getGlobalOrds(LeafReaderContext ctx) throws IOException {
return acceptedGlobalOrdinals == null ?
valuesSource.globalOrdinalsValues(ctx) : new FilteredOrdinals(valuesSource.globalOrdinalsValues(ctx), acceptedGlobalOrdinals);
}
@Override
public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCollector sub) throws IOException {
SortedSetDocValues globalOrds = getGlobalOrds(ctx);
SortedSetDocValues globalOrds = valuesSource.globalOrdinalsValues(ctx);
collectionStrategy.globalOrdsReady(globalOrds);
SortedDocValues singleValues = DocValues.unwrapSingleton(globalOrds);
if (singleValues != null) {
segmentsWithSingleValuedOrds++;
if (acceptedGlobalOrdinals == ALWAYS_TRUE) {
/*
* Optimize when there isn't a filter because that is very
* common and marginally faster.
*/
return resultStrategy.wrapCollector(new LeafBucketCollectorBase(sub, globalOrds) {
@Override
public void collect(int doc, long owningBucketOrd) throws IOException {
assert owningBucketOrd == 0;
if (false == singleValues.advanceExact(doc)) {
return;
}
int globalOrd = singleValues.ordValue();
collectionStrategy.collectGlobalOrd(doc, globalOrd, sub);
}
});
}
return resultStrategy.wrapCollector(new LeafBucketCollectorBase(sub, globalOrds) {
@Override
public void collect(int doc, long owningBucketOrd) throws IOException {
assert owningBucketOrd == 0;
if (singleValues.advanceExact(doc)) {
int ord = singleValues.ordValue();
collectionStrategy.collectGlobalOrd(doc, ord, sub);
if (false == singleValues.advanceExact(doc)) {
return;
}
int globalOrd = singleValues.ordValue();
if (false == acceptedGlobalOrdinals.test(globalOrd)) {
return;
}
collectionStrategy.collectGlobalOrd(doc, globalOrd, sub);
}
});
}
segmentsWithMultiValuedOrds++;
if (acceptedGlobalOrdinals == ALWAYS_TRUE) {
/*
* Optimize when there isn't a filter because that is very
* common and marginally faster.
*/
return resultStrategy.wrapCollector(new LeafBucketCollectorBase(sub, globalOrds) {
@Override
public void collect(int doc, long owningBucketOrd) throws IOException {
assert owningBucketOrd == 0;
if (false == globalOrds.advanceExact(doc)) {
return;
}
for (long globalOrd = globalOrds.nextOrd(); globalOrd != NO_MORE_ORDS; globalOrd = globalOrds.nextOrd()) {
collectionStrategy.collectGlobalOrd(doc, globalOrd, sub);
}
}
});
@ -136,10 +173,14 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
@Override
public void collect(int doc, long owningBucketOrd) throws IOException {
assert owningBucketOrd == 0;
if (globalOrds.advanceExact(doc)) {
for (long globalOrd = globalOrds.nextOrd(); globalOrd != NO_MORE_ORDS; globalOrd = globalOrds.nextOrd()) {
collectionStrategy.collectGlobalOrd(doc, globalOrd, sub);
if (false == globalOrds.advanceExact(doc)) {
return;
}
for (long globalOrd = globalOrds.nextOrd(); globalOrd != NO_MORE_ORDS; globalOrd = globalOrds.nextOrd()) {
if (false == acceptedGlobalOrdinals.test(globalOrd)) {
continue;
}
collectionStrategy.collectGlobalOrd(doc, globalOrd, sub);
}
}
});
@ -160,6 +201,9 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
super.collectDebugInfo(add);
add.accept("collection_strategy", collectionStrategy.describe());
add.accept("result_strategy", resultStrategy.describe());
add.accept("segments_with_single_valued_ords", segmentsWithSingleValuedOrds);
add.accept("segments_with_multi_valued_ords", segmentsWithMultiValuedOrds);
add.accept("has_filter", acceptedGlobalOrdinals != ALWAYS_TRUE);
}
/**
@ -253,26 +297,31 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
assert sub == LeafBucketCollector.NO_OP_COLLECTOR;
final SortedDocValues singleValues = DocValues.unwrapSingleton(segmentOrds);
mapping = valuesSource.globalOrdinalsMapping(ctx);
// Dense mode doesn't support include/exclude so we don't have to check it here.
if (singleValues != null) {
segmentsWithSingleValuedOrds++;
return resultStrategy.wrapCollector(new LeafBucketCollectorBase(sub, segmentOrds) {
@Override
public void collect(int doc, long owningBucketOrd) throws IOException {
assert owningBucketOrd == 0;
if (singleValues.advanceExact(doc)) {
final int ord = singleValues.ordValue();
segmentDocCounts.increment(ord + 1, 1);
if (false == singleValues.advanceExact(doc)) {
return;
}
int ord = singleValues.ordValue();
segmentDocCounts.increment(ord + 1, 1);
}
});
}
segmentsWithMultiValuedOrds++;
return resultStrategy.wrapCollector(new LeafBucketCollectorBase(sub, segmentOrds) {
@Override
public void collect(int doc, long owningBucketOrd) throws IOException {
assert owningBucketOrd == 0;
if (segmentOrds.advanceExact(doc)) {
for (long segmentOrd = segmentOrds.nextOrd(); segmentOrd != NO_MORE_ORDS; segmentOrd = segmentOrds.nextOrd()) {
segmentDocCounts.increment(segmentOrd + 1, 1);
}
if (false == segmentOrds.advanceExact(doc)) {
return;
}
for (long segmentOrd = segmentOrds.nextOrd(); segmentOrd != NO_MORE_ORDS; segmentOrd = segmentOrds.nextOrd()) {
segmentDocCounts.increment(segmentOrd + 1, 1);
}
}
});
@ -306,52 +355,6 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
}
}
private static final class FilteredOrdinals extends AbstractSortedSetDocValues {
private final SortedSetDocValues inner;
private final LongPredicate accepted;
private FilteredOrdinals(SortedSetDocValues inner, LongPredicate accepted) {
this.inner = inner;
this.accepted = accepted;
}
@Override
public long getValueCount() {
return inner.getValueCount();
}
@Override
public BytesRef lookupOrd(long ord) throws IOException {
return inner.lookupOrd(ord);
}
@Override
public long nextOrd() throws IOException {
for (long ord = inner.nextOrd(); ord != NO_MORE_ORDS; ord = inner.nextOrd()) {
if (accepted.test(ord)) {
return ord;
}
}
return NO_MORE_ORDS;
}
@Override
public boolean advanceExact(int target) throws IOException {
if (inner.advanceExact(target)) {
for (long ord = inner.nextOrd(); ord != NO_MORE_ORDS; ord = inner.nextOrd()) {
if (accepted.test(ord)) {
// reset the iterator
boolean advanced = inner.advanceExact(target);
assert advanced;
return true;
}
}
}
return false;
}
}
/**
* Strategy for collecting global ordinals.
* <p>
@ -800,4 +803,9 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
System.arraycopy(from.bytes, from.offset, to.bytes, 0, from.length);
}
}
/**
* Predicate used for {@link #acceptedGlobalOrdinals} if there is no filter.
*/
private static final LongPredicate ALWAYS_TRUE = l -> true;
}