Fix HllSketchHolderObjectStrategy#isSafeToConvertToNullSketch. (#15860)

* Fix HllSketchHolderObjectStrategy#isSafeToConvertToNullSketch.

The prior code from #15162 was reading only the low-order byte of an int
representing the size of a coupon set. As a result, it would erroneously
believe that a coupon set with a multiple of 256 elements was empty.
This commit is contained in:
Gian Merlino 2024-02-07 18:44:28 -08:00 committed by GitHub
parent 514b3b4d01
commit 21a97f4c61
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 79 additions and 5 deletions

View File

@ -85,7 +85,7 @@ public class HllSketchHolderObjectStrategy implements ObjectStrategy<HllSketchHo
* Checks the initial 8 byte header to find the type of internal sketch implementation, then uses the logic the * Checks the initial 8 byte header to find the type of internal sketch implementation, then uses the logic the
* corresponding implementation uses to tell if a sketch is empty while deserializing it. * corresponding implementation uses to tell if a sketch is empty while deserializing it.
*/ */
private static boolean isSafeToConvertToNullSketch(ByteBuffer buf, int size) static boolean isSafeToConvertToNullSketch(ByteBuffer buf, int size)
{ {
if (size < 8) { if (size < 8) {
// Sanity check. // Sanity check.
@ -110,13 +110,14 @@ public class HllSketchHolderObjectStrategy implements ObjectStrategy<HllSketchHo
int listCount = buf.get(position + 6) & 0xFF; // get(LIST_COUNT_BYTE) & 0xFF int listCount = buf.get(position + 6) & 0xFF; // get(LIST_COUNT_BYTE) & 0xFF
return listCount == 0; return listCount == 0;
case 1: // SET case 1: // SET
if (preInts != 3 || size < 9) { if (preInts != 3 || size < 12) {
// preInts should be HASH_SET_PREINTS, Sanity check. // preInts should be HASH_SET_PREINTS, Sanity check.
// We also need to read an additional byte for Set implementations. // We also need to read an additional int for Set implementations.
return false; return false;
} }
// Based on org.apache.datasketches.hll.PreambleUtil.extractHashSetCount // Based on org.apache.datasketches.hll.PreambleUtil.extractHashSetCount
int setCount = buf.get(position + 8); // get(HASH_SET_COUNT_INT) // Endianness of buf doesn't matter, since we're checking for equality with zero.
int setCount = buf.getInt(position + 8); // getInt(HASH_SET_COUNT_INT)
return setCount == 0; return setCount == 0;
case 2: // HLL case 2: // HLL
if (preInts != 10) { if (preInts != 10) {

View File

@ -21,14 +21,16 @@ package org.apache.druid.query.aggregation.datasketches.hll;
import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.SketchesArgumentException;
import org.apache.datasketches.hll.HllSketch; import org.apache.datasketches.hll.HllSketch;
import org.apache.datasketches.hll.TgtHllType;
import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.common.StringUtils;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Test; import org.junit.Test;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.nio.ByteOrder; import java.nio.ByteOrder;
import java.util.Random;
public class HllSketchObjectStrategyTest public class HllSketchHolderObjectStrategyTest
{ {
@Test @Test
public void testSafeRead() public void testSafeRead()
@ -74,4 +76,75 @@ public class HllSketchObjectStrategyTest
() -> objectStrategy.fromByteBufferSafe(buf4, garbageLonger.length).getSketch().copy() () -> objectStrategy.fromByteBufferSafe(buf4, garbageLonger.length).getSketch().copy()
); );
} }
@Test
public void testHllSketchIsNullEquivalent()
{
final Random random = new Random(0);
for (final TgtHllType tgtHllType : TgtHllType.values()) {
for (int lgK = 7; lgK < 22; lgK++) {
for (int sz : new int[]{0, 1, 2, 127, 128, 129, 255, 256, 257, 511, 512, 513, 16383, 16384, 16385}) {
final String description = StringUtils.format("tgtHllType[%s], lgK[%s], sz[%s]", tgtHllType, lgK, sz);
final HllSketch sketch = new HllSketch(lgK, tgtHllType);
for (int i = 0; i < sz; i++) {
sketch.update(random.nextLong());
}
final boolean expectEmpty = sz == 0;
// --------------------------------
// Compact array, little endian buf
final byte[] compactBytes = sketch.toCompactByteArray();
// Add a byte of padding on either side
ByteBuffer buf = ByteBuffer.allocate(compactBytes.length + 2);
buf.order(ByteOrder.LITTLE_ENDIAN);
buf.position(1);
buf.put(compactBytes);
buf.position(1);
Assert.assertEquals(
"Compact array littleEndian " + description,
expectEmpty,
HllSketchHolderObjectStrategy.isSafeToConvertToNullSketch(buf, compactBytes.length)
);
Assert.assertEquals(1, buf.position());
// -----------------------------
// Compact array, big endian buf
buf.order(ByteOrder.BIG_ENDIAN);
Assert.assertEquals(
"Compact array bigEndian " + description,
expectEmpty,
HllSketchHolderObjectStrategy.isSafeToConvertToNullSketch(buf, compactBytes.length)
);
Assert.assertEquals(1, buf.position());
// ----------------------------------
// Updatable array, little endian buf
final byte[] updatableBytes = sketch.toUpdatableByteArray();
// Add a byte of padding on either side
buf = ByteBuffer.allocate(updatableBytes.length + 2);
buf.order(ByteOrder.LITTLE_ENDIAN);
buf.position(1);
buf.put(updatableBytes);
buf.position(1);
Assert.assertEquals(
"Updatable array littleEndian " + description,
expectEmpty,
HllSketchHolderObjectStrategy.isSafeToConvertToNullSketch(buf, updatableBytes.length)
);
Assert.assertEquals(1, buf.position());
// -------------------------------
// Updatable array, big endian buf
buf.order(ByteOrder.BIG_ENDIAN);
Assert.assertEquals(
"Updatable array bigEndian " + description,
expectEmpty,
HllSketchHolderObjectStrategy.isSafeToConvertToNullSketch(buf, updatableBytes.length)
);
Assert.assertEquals(1, buf.position());
}
}
}
}
} }