Simplify BucketedSort (#53199) (#53240)

Our lovely `BitArray` compactly stores "flags", lazilly growing its
underlying storage. It is super useful when you need to store one bit of
data for a zillion buckets or a documents or something. Usefully, it
defaults to `false`. But there is a wrinkle! If you ask it whether or
not a bit is set but it hasn't grown its underlying storage array
"around" that index then it'll throw an `ArrayIndexOutOfBoundsException`.
The per-document use cases tend to show up in order and don't tend to
mind this too much. But the use case in aggregations, the per-bucket use
case, does. Because buckets are collected out of order all the time.

This changes `BitArray` so it'll return `false` if the index is too big
for the underlying storage. After all, that index *can't* have been set
or else we would have grown the underlying array. Logically, I believe
this makes sense. And it makes my life easy. At the cost of three lines.

*but* this adds an extra test to every call to `get`. I think this is
likely ok because it is "very close" to an array index lookup that
already runs the same test. So I *think* it'll end up merged with the
array bounds check.
This commit is contained in:
Nik Everett 2020-03-06 15:27:51 -05:00 committed by GitHub
parent d6813cb348
commit 7c9641ef9d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 30 additions and 9 deletions

View File

@ -37,16 +37,32 @@ public final class BitArray implements Releasable {
this.bits = bigArrays.newLongArray(initialSize, true);
}
/**
* Set the {@code index}th bit.
*/
public void set(int index) {
fill(index, true);
}
/**
* Clear the {@code index}th bit.
*/
public void clear(int index) {
fill(index, false);
}
/**
* Is the {@code index}th bit set?
*/
public boolean get(int index) {
int wordNum = index >> 6;
if (wordNum >= bits.size()) {
/*
* If the word is bigger than the array then it could *never* have
* been set.
*/
return false;
}
long bitmask = 1L << index;
return (bits.get(wordNum) & bitmask) != 0;
}

View File

@ -134,13 +134,6 @@ public abstract class BucketedSort implements Releasable {
* it is still gathering.
*/
private final BitArray heapMode;
/**
* The highest bucket ordinal that has been converted into a heap. This is
* required because calling {@link BitArray#get(int)} on an index higher
* than the highest one that was {@link BitArray#set(int) set} could throw
* and {@link ArrayIndexOutOfBoundsException}. So we check this first.
*/
private long maxHeapBucket = 0;
protected BucketedSort(BigArrays bigArrays, SortOrder order, DocValueFormat format, int bucketSize, ExtraData extra) {
this.bigArrays = bigArrays;
@ -216,7 +209,7 @@ public abstract class BucketedSort implements Releasable {
* Is this bucket a min heap {@code true} or in gathering mode {@code false}?
*/
private boolean inHeapMode(long bucket) {
return bucket <= maxHeapBucket && heapMode.get((int) bucket);
return heapMode.get((int) bucket);
}
/**
@ -430,7 +423,6 @@ public abstract class BucketedSort implements Releasable {
throw new UnsupportedOperationException("Bucketed sort doesn't support more than [" + Integer.MAX_VALUE + "] buckets");
// BitArray needs int keys and this'd take a ton of memory to use that many buckets. So we just don't.
}
maxHeapBucket = Math.max(bucket, maxHeapBucket);
heapMode.set((int) bucket);
heapify(rootIndex);
} else {

View File

@ -50,4 +50,17 @@ public class BitArrayTests extends ESTestCase {
}
}
}
public void testTooBigIsNotSet() {
try (BitArray bitArray = new BitArray(1, BigArrays.NON_RECYCLING_INSTANCE)) {
for (int i = 0; i < 1000; i++) {
/*
* The first few times this is called we check within the
* array. But we quickly go beyond it and those all return
* false as well.
*/
assertFalse(bitArray.get(i));
}
}
}
}