Add microbenchmark for LongKeyedBucketOrds (#58608) (#59459)

I've always been confused by the strange behavior that I saw when
working on #57304. Specifically, I saw switching from a bimorphic
invocation to a monomorphic invocation to give us a 7%-15% performance
bump. This felt *bonkers* to me. And, it also made me wonder whether
it'd be worth looking into doing it everywhere.

It turns out that, no, it isn't needed everywhere. This benchmark shows
that a bimorphic invocation like:
```
LongKeyedBucketOrds ords = new LongKeyedBucketOrds.FromSingle(bigArrays);
ords.add(0, 0); <------ this line
```

is 19% slower than a monomorphic invocation like:
```
LongKeyedBucketOrds.FromSingle ords = new LongKeyedBucketOrds.FromSingle(bigArrays);
ords.add(0, 0); <------ this line
```

But *only* when the reference is mutable. In the example above, if
`ords` is never changed then both perform the same. But if the `ords`
reference is assigned twice then we start to see the difference:
```
immutable bimorphic    avgt   10   6.468 ± 0.045  ns/op
immutable monomorphic  avgt   10   6.756 ± 0.026  ns/op
mutable   bimorphic    avgt   10   9.741 ± 0.073  ns/op
mutable   monomorphic  avgt   10   8.190 ± 0.016  ns/op
```

So the conclusion from all this is that we've done the right thing:
`auto_date_histogram` is the only aggregation in which `ords` isn't final
and it is the only aggregation that forces monomorphic invocations. All
other aggregations use an immutable bimorphic invocation. Which is fine.

Relates to #56487
This commit is contained in:
Nik Everett 2020-07-13 17:22:46 -04:00 committed by GitHub
parent db89764539
commit 81cba796e6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 172 additions and 0 deletions

View File

@ -0,0 +1,172 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.benchmark.search.aggregations.bucket.terms;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.util.PageCacheRecycler;
import org.elasticsearch.search.aggregations.CardinalityUpperBound;
import org.elasticsearch.search.aggregations.bucket.histogram.AutoDateHistogramAggregationBuilder;
import org.elasticsearch.search.aggregations.bucket.terms.LongKeyedBucketOrds;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OperationsPerInvocation;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;
import java.util.concurrent.TimeUnit;
/**
* JMH microbenchmark for {@code add} on {@link LongKeyedBucketOrds}.
* <p>
* Every benchmark method calls {@code add} {@link #LIMIT} times. The variants
* differ in the declared type of the {@code ords} reference (the concrete
* {@link LongKeyedBucketOrds.FromSingle} versus the more general
* {@link LongKeyedBucketOrds}), in whether that reference is reassigned while
* the loop runs, and in whether the instance was built for
* {@link CardinalityUpperBound#ONE} or {@link CardinalityUpperBound#MANY}.
* <p>
* Run configuration, as declared by the annotations below: two forks, ten
* warmup iterations and five measured iterations each, reporting average time
* in nanoseconds. {@code @OperationsPerInvocation(1_000_000)} matches
* {@link #LIMIT} so that the reported time is per {@code add} call rather than
* per benchmark method invocation.
* <p>
* Results are reported under the method names, so treat those names
* (including the "Monmorphic" spelling) as part of this class's interface.
* The shape of each loop is the thing being measured; do not "tidy" the
* methods without re-running the benchmark.
*/
@Fork(2)
@Warmup(iterations = 10)
@Measurement(iterations = 5)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
// Must stay equal to LIMIT, the number of add calls each benchmark method makes.
@OperationsPerInvocation(1_000_000)
@State(Scope.Benchmark)
public class LongKeyedBucketOrdsBenchmark {
/**
* The number of {@code add} calls made by every benchmark method. This is
* the same number that is passed to {@code @OperationsPerInvocation} on the
* class; change the two together or the per-operation time will be wrong.
*/
private static final long LIMIT = 1_000_000;
/**
* The number of distinct values to add to the buckets.
* <p>
* Values are produced as {@code i % DISTINCT_VALUES}, so they cycle through
* {@code 0} to {@code DISTINCT_VALUES - 1}.
*/
private static final long DISTINCT_VALUES = 10;
/**
* The number of buckets to create in the {@link #multiBucket} case.
* <p>
* If this is not relatively prime to {@link #DISTINCT_VALUES} then the
* values won't be scattered evenly across the buckets.
*/
private static final long DISTINCT_BUCKETS = 21;
/**
* Page recycler built from empty settings; only used to construct
* {@link #bigArrays}.
*/
private final PageCacheRecycler recycler = new PageCacheRecycler(Settings.EMPTY);
/**
* The {@link BigArrays} handed to every {@link LongKeyedBucketOrds} that the
* benchmarks create.
* <p>
* NOTE(review): the {@code null} second argument is presumably the circuit
* breaker service, meaning no memory accounting is done here - confirm
* against the {@link BigArrays} constructor.
*/
private final BigArrays bigArrays = new BigArrays(recycler, null, "REQUEST");
/**
* Force loading all of the implementations just for extra paranoia's sake.
* We really don't want the JVM to be able to eliminate one of them just
* because we don't use it in the particular benchmark. That is totally a
* thing it'd do. It is sneaky.
*
* @param bh sink that receives the class literals of both implementations
*/
@Setup
public void forceLoadClasses(Blackhole bh) {
bh.consume(LongKeyedBucketOrds.FromSingle.class);
bh.consume(LongKeyedBucketOrds.FromMany.class);
}
/**
* Emulates a way that we do <strong>not</strong> use {@link LongKeyedBucketOrds}
* because it is not needed.
* <p>
* The reference is declared as the concrete
* {@link LongKeyedBucketOrds.FromSingle} type and, being a try-with-resources
* variable, is never reassigned. All values are added to bucket {@code 0}.
*
* @param bh sink that receives the populated {@code ords} so the work is not discarded
*/
@Benchmark
public void singleBucketIntoSingleImmutableMonmorphicInvocation(Blackhole bh) {
try (LongKeyedBucketOrds.FromSingle ords = new LongKeyedBucketOrds.FromSingle(bigArrays)) {
for (long i = 0; i < LIMIT; i++) {
ords.add(0, i % DISTINCT_VALUES);
}
bh.consume(ords);
}
}
/**
* Emulates the way that most aggregations use {@link LongKeyedBucketOrds}.
* <p>
* The reference is declared as the general {@link LongKeyedBucketOrds} type
* and obtained from {@code build} with {@link CardinalityUpperBound#ONE}
* (presumably yielding a {@link LongKeyedBucketOrds.FromSingle} - confirm
* against {@code build}). Being a try-with-resources variable, it is never
* reassigned. All values are added to bucket {@code 0}.
*
* @param bh sink that receives the populated {@code ords} so the work is not discarded
*/
@Benchmark
public void singleBucketIntoSingleImmutableBimorphicInvocation(Blackhole bh) {
try (LongKeyedBucketOrds ords = LongKeyedBucketOrds.build(bigArrays, CardinalityUpperBound.ONE)) {
for (long i = 0; i < LIMIT; i++) {
ords.add(0, i % DISTINCT_VALUES);
}
bh.consume(ords);
}
}
/**
* Emulates the way that {@link AutoDateHistogramAggregationBuilder} uses {@link LongKeyedBucketOrds}.
* <p>
* The reference is declared as the concrete
* {@link LongKeyedBucketOrds.FromSingle} type but is a plain local that gets
* reassigned: every 100,000 iterations the current instance is closed and
* replaced with a new one. With {@link #LIMIT} at 1,000,000 that is ten
* replacements per invocation, the first of them at {@code i == 0} before
* anything has been added.
*
* @param bh sink that receives every {@code ords} instance, replaced or final
*/
@Benchmark
public void singleBucketIntoSingleMutableMonmorphicInvocation(Blackhole bh) {
LongKeyedBucketOrds.FromSingle ords = new LongKeyedBucketOrds.FromSingle(bigArrays);
for (long i = 0; i < LIMIT; i++) {
// Periodically swap in a fresh instance so that `ords` is a genuinely mutable reference.
if (i % 100_000 == 0) {
ords.close();
// Hand the closed instance to the blackhole so the adds made to it are not dead code.
bh.consume(ords);
ords = new LongKeyedBucketOrds.FromSingle(bigArrays);
}
ords.add(0, i % DISTINCT_VALUES);
}
// The last instance is still open here: consume it, then release it.
bh.consume(ords);
ords.close();
}
/**
* Emulates a way that we do <strong>not</strong> use {@link LongKeyedBucketOrds}
* because it is significantly slower than the
* {@link #singleBucketIntoSingleMutableMonmorphicInvocation monomorphic invocation}.
* <p>
* Same loop as that method, except that the reference is declared as the
* general {@link LongKeyedBucketOrds} type and each instance comes from
* {@code build} with {@link CardinalityUpperBound#ONE}.
*
* @param bh sink that receives every {@code ords} instance, replaced or final
*/
@Benchmark
public void singleBucketIntoSingleMutableBimorphicInvocation(Blackhole bh) {
LongKeyedBucketOrds ords = LongKeyedBucketOrds.build(bigArrays, CardinalityUpperBound.ONE);
for (long i = 0; i < LIMIT; i++) {
// Periodically swap in a fresh instance so that `ords` is a genuinely mutable reference.
if (i % 100_000 == 0) {
ords.close();
// Hand the closed instance to the blackhole so the adds made to it are not dead code.
bh.consume(ords);
ords = LongKeyedBucketOrds.build(bigArrays, CardinalityUpperBound.ONE);
}
ords.add(0, i % DISTINCT_VALUES);
}
// The last instance is still open here: consume it, then release it.
bh.consume(ords);
ords.close();
}
/**
* Emulates an aggregation that collects from a single bucket "by accident".
* This can happen if an aggregation is under, say, a {@code terms}
* aggregation and there is only a single value for that term in the index.
* <p>
* The instance is built for {@link CardinalityUpperBound#MANY}, yet every
* value is added to bucket {@code 0}.
*
* @param bh sink that receives the populated {@code ords} so the work is not discarded
*/
@Benchmark
public void singleBucketIntoMulti(Blackhole bh) {
try (LongKeyedBucketOrds ords = LongKeyedBucketOrds.build(bigArrays, CardinalityUpperBound.MANY)) {
for (long i = 0; i < LIMIT; i++) {
ords.add(0, i % DISTINCT_VALUES);
}
bh.consume(ords);
}
}
/**
* Emulates an aggregation that collects from many buckets.
* <p>
* The bucket is {@code i % DISTINCT_BUCKETS} and the value is
* {@code i % DISTINCT_VALUES}. Because 21 and 10 are relatively prime, the
* loop visits all 210 (bucket, value) combinations.
*
* @param bh sink that receives the populated {@code ords} so the work is not discarded
*/
@Benchmark
public void multiBucket(Blackhole bh) {
try (LongKeyedBucketOrds ords = LongKeyedBucketOrds.build(bigArrays, CardinalityUpperBound.MANY)) {
for (long i = 0; i < LIMIT; i++) {
ords.add(i % DISTINCT_BUCKETS, i % DISTINCT_VALUES);
}
bh.consume(ords);
}
}
}