mirror of https://github.com/apache/druid.git
Merge pull request #567 from metamx/hyperloglog-bugfixes
Fix edge cases in HLL folding
This commit is contained in:
commit
45508718c8
|
@ -299,8 +299,12 @@ public abstract class HyperLogLogCollector implements Comparable<HyperLogLogColl
|
|||
if (positionOf1 <= registerOffset) {
|
||||
return;
|
||||
} else if (positionOf1 > (registerOffset + range)) {
|
||||
byte currMax = getMaxOverflowValue();
|
||||
final byte currMax = getMaxOverflowValue();
|
||||
if (positionOf1 > currMax) {
|
||||
if(currMax <= (registerOffset + range)) {
|
||||
// this could be optimized by having an add without sanity checks
|
||||
add(getMaxOverflowRegister(), currMax);
|
||||
}
|
||||
setMaxOverflowValue(positionOf1);
|
||||
setMaxOverflowRegister(bucket);
|
||||
}
|
||||
|
@ -354,8 +358,6 @@ public abstract class HyperLogLogCollector implements Comparable<HyperLogLogColl
|
|||
throw new ISE("offsetDiff[%d] < 0, shouldn't happen because of swap.", offsetDiff);
|
||||
}
|
||||
|
||||
add(other.getMaxOverflowRegister(), other.getMaxOverflowValue());
|
||||
|
||||
final int myPayloadStart = getPayloadBytePosition();
|
||||
otherBuffer.position(other.getPayloadBytePosition());
|
||||
|
||||
|
@ -395,6 +397,9 @@ public abstract class HyperLogLogCollector implements Comparable<HyperLogLogColl
|
|||
// no need to call setRegisterOffset(myOffset) here, since it gets updated every time myOffset is incremented
|
||||
setNumNonZeroRegisters(numNonZero);
|
||||
|
||||
// this will add the max overflow and also recheck if offset needs to be shifted
|
||||
add(other.getMaxOverflowRegister(), other.getMaxOverflowValue());
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
|
@ -570,18 +575,19 @@ public abstract class HyperLogLogCollector implements Comparable<HyperLogLogColl
|
|||
private short addNibbleRegister(short bucket, byte positionOf1)
|
||||
{
|
||||
short numNonZeroRegs = getNumNonZeroRegisters();
|
||||
final short position = (short) (bucket >> 1);
|
||||
|
||||
final int position = getPayloadBytePosition() + (short) (bucket >> 1);
|
||||
final boolean isUpperNibble = ((bucket & 0x1) == 0);
|
||||
|
||||
byte shiftedPositionOf1 = (isUpperNibble) ? (byte) (positionOf1 << bitsPerBucket) : positionOf1;
|
||||
final byte shiftedPositionOf1 = (isUpperNibble) ? (byte) (positionOf1 << bitsPerBucket) : positionOf1;
|
||||
|
||||
if (storageBuffer.remaining() != getNumBytesForDenseStorage()) {
|
||||
convertToDenseStorage();
|
||||
}
|
||||
|
||||
byte origVal = storageBuffer.get(getPayloadBytePosition() + position);
|
||||
byte newValueMask = (isUpperNibble) ? (byte) 0xf0 : (byte) 0x0f;
|
||||
byte originalValueMask = (byte) (newValueMask ^ 0xff);
|
||||
final byte origVal = storageBuffer.get(position);
|
||||
final byte newValueMask = (isUpperNibble) ? (byte) 0xf0 : (byte) 0x0f;
|
||||
final byte originalValueMask = (byte) (newValueMask ^ 0xff);
|
||||
|
||||
// if something was at zero, we have to increase the numNonZeroRegisters
|
||||
if ((origVal & newValueMask) == 0 && shiftedPositionOf1 != 0) {
|
||||
|
@ -589,7 +595,7 @@ public abstract class HyperLogLogCollector implements Comparable<HyperLogLogColl
|
|||
}
|
||||
|
||||
storageBuffer.put(
|
||||
getPayloadBytePosition() + position,
|
||||
position,
|
||||
(byte) (UnsignedBytes.max((byte) (origVal & newValueMask), shiftedPositionOf1) | (origVal & originalValueMask))
|
||||
);
|
||||
|
||||
|
|
|
@ -19,16 +19,23 @@
|
|||
|
||||
package io.druid.query.aggregation.hyperloglog;
|
||||
|
||||
import com.google.common.base.Function;
|
||||
import com.google.common.collect.Collections2;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.hash.HashFunction;
|
||||
import com.google.common.hash.Hashing;
|
||||
import org.apache.commons.codec.binary.Base64;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.security.MessageDigest;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
|
||||
/**
|
||||
|
@ -810,6 +817,46 @@ public class HyperLogLogCollectorTest
|
|||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFoldOrder() throws Exception
|
||||
{
|
||||
final List<String> objects = Lists.newArrayList(
|
||||
"AQcH/xYEMXOjRTVSQ1NXVENEM1RTUlVTRDI1aEVnhkOjNUaCI2MkU2VVhVNkNyVTa4NEYkS0kjZYU1RDdEYzUjglNTUzVFM0NkU3ZFUjOVJCdlU0N2QjRDRUV1MyZjNmVDOUM2RVVFRzhnUzVXY1R1RHUnNziURUdmREM0VjVEQmU0aEInZYNzNZNVRFgzVFNolSJHNIQ3QklEZlNSNoNTJXpDk1dFWjJGNYNiQzQkZFNEYzc1NVhSczM2NmJDZlc3JJRCpVNiRlNEI3dmU1ZGI0Q1RCMhNFZEJDZDYyNFOCM3U0VmRlVlNIRVQ4VVw1djNDVURHVSaFU0VEY0U1JFNIVCYlVEJWM2NWU0eURDOjQ6YyNTYkZjNUVjR1ZDdnVkMzVHZFpjMzlmNEFHM0dHJlRYTHSEQjVZVVZkVVIzIjg2SUU0NSM0VFNDNCdGVlQkhBNENCVTZGZEVlxFQyQ0NYWkUmVUJUYzRlNqg4NVVTNThEJkRGNDNUNFSEYmgkR0dDR1JldCNhVEZGRENGc1NDRUNER3WJRTRHQ4JlOYZoJDVVVVMzZSREZ1Q1UjSHNkdUMlU0ODIzZThSNmNDNjQ1o2I0YiRGYyZkNUJYVEMyN2QpQyMkc2VTE4U2VCNHZFRDNTh0IzI2VFNTMlUkNGMlKTRCIyR3QiQzFUNkRTdDM6RDRFI3VyVlcyWCUlQ0YjNjU2Q2dEVFNTRyRlI7VElHVTVVNGk0JHJTQzQkQyVlV0NCVlRkhWYkQ0RVaDNYdFZHWEWFJEYpM0QjNjNVUzNCVzVkgzZGFzQkRZUzN2U1dUFGVWZTUzVUREZDciZEVVYVNjeCU0ZDdEhzIpU2RTOFRUQkWlk1OFRUVTN1MkZSM3ZFc1VDNnUmc2NKNUaUIzd3M0RWxEZTsiNENLVHU0NFUmQ2RWRFdCNUVENFkxZCEnRLQkNEU0RVNmVDQjl9ZmNkM1QVM0MzQkUjJlVHRkNEVWlENDVUIlUvRkM0RVY1UzY6OGVHVCRDIzRUUlUjM2RDWSVkVIU1U1ZiVFNlNDhTN1VWNTVEZ2RzNzVDQlY0ZUNENUM5NUdkRDJGYzRCUzIjRGR4UmJFI4GDRTUiQ0ZUhVY1ZEYoZSRoVDYnREYkQ1SUU0RWUycjp2RZIySVZkUmZDREZVJGQyVEc1JElBZENEU2VEQlVUUnNDQziLRTNidmNjVCtjRFU2Q0SGYzVHVpGTNoVDxFVSMlWTJFQyRJdV1EI3RDloYyNFQ0c1NVY0ZHVEY0dkM2QkQyVDVUVTNFUyamMUdSNrNz0mlFlERzZTSGhFRjVGM3NWU2NINDI2U1RERUhjY4FHNWNTVTV1U0U2I0VXNEZERWNDNUSjI1WmMmQ4U=",
|
||||
"AQgH+BUFUEUrZVRjM2IjMzJRESMlUnlTJjEjRhRlNBEyMSUpaGJTMjRCIzMTNCRENRdxNiNEZCQzNERYMiAyIiQmUTI+MhEzV1RWJoMjQjIySDN0QiYDUjUzNjRUVEYyQleDEiUmg0ERRjIjIzJUQjMxNlJGUTNDJFNTRzJiE1M0RjQzUzIiFDUmMjIzJWVCNENTIRJVODUzEkIVMhFEIjM0MkMyIRRCNFNxQyNCQ2UzOFQiJSM0EzU1V1M2EjhUVENDclZzImEiMTJBQlQiJCgyIyKkJSUlNBNDE2M3QSIyMicjMlJEUhJDJFQjJ0VSQ0QyYSFhZSNlQ4REUzVFIlOFRHIkYUJEM8RVMkMiMEczQwMlE1EkAlNiQlhCNkISRVI0ITUjRDU1JVNlK1QyGGRHQVM0NUVHQ1MkMyQoIzMzFCFUI0IhU1OIhCIlZUQVIUMyYzMlMUZ0RCKEIigUIlQ0QkQTM0MkM0QyJkUSM2I2tHJDUTQ0RBQ0YyNlUxUzIiIiMUiSMzUlJDNDQjM0ITQyNIM1MyNWM0MDOTZYVDRWIiZhMzc0NCJ0Q0NDZEMUElMyRyMmUhNiMkIZNjMkEyRTIzYkMzNUODUTNDJVM0ZTQjFCJCNWSTUlEiNCM1U2FCZUJzMVMyLjNkMhITVDEjIYMzNiVmIlO1VTMjMiVDQ2NTJFYyE0Q2IjRDN2IjRTRUVTFUVEYVKBVSMVJSFE0zOXNSJIqVElMVM4MiZEFSMhRlJEJUZnMycmQmQyJDl1JzVjMXQ0MzMjE1VUI1JDJUQyYRQ2JVZzQUJDM2IyInEkY1QiZTJEMRMiMxRVNEUjJUNkJHNSQiNCVCIyIjJUQlEhNUdFUhQzgkcSZaJUVUM0YiJEM2SjczUUIUIlQiM0RiQkIzZhRBJSRzQ0ZUI00UUSRSQlQmMkNINzODQhJFRTZ0FRQ3QTRhIzFTJFRBMmMzQzQhZENUMiIlV2VEMiNFRWQ1F1IyFXRSUyRTMqZ3I0YyhUNEJRMjISZRc2NDOEIjIxVGVWIXYyMiNCJBFDQSMhIzMjVFIDElgyJCUyVFgkRSQzIjJFQlNWRTQWMmQzFFOiMzVTZGMxNFZUNmIjRjETNUNURERTQjYVIkEzNEEyNDNTVUJSVzVkMjEyUlMjQ0RGgyFFNUQhRGMmRUQ2ZSOFETUYNlZCUhRiU2QhVUUiIlJDRjMhRVJDZxNSRTNBRCEoI0FGNUVRE0VFOGdCRDM2QkJCFSQhMxITRoE0VFIzVWUiUTNkRhNDMiMmIzRDQSNTFDoldaJDcnNjkSMJg3IkIiRENSQmciUhY2NFQ4RSNoJENkWDMmVCJGMxQjJGJScyNTJDVDNEEiZSMzQyIyVGRTNEIUw=",
|
||||
"AQgH+hQAFyMzlFVXNCNlRxRUYlRUUUZCMnRFJiR0WTgyZiRJZzRFQkVTVVVWc2ZFMlY1QkIxYUQTI0JDY1YkNEVENGUuQTRiNkQ0VUEzNkKUKLSIVkUhNiZURnRFMzcjVEBTdjVVVCIzJDM0hjc0RDVlVjRqMjJVZTNSM0QmQyMTRlNzVCNERFQyMxNBZHMiUSdYIUUjNlVjNzRyYWFHRHI3hKMnYnhFNCZOdlNUZBM0Q0clNTVBiEQRMUQzNSNVQ0IkEmZYNzIyNkRSUik2VBOVRCRDg0IilEMlcjRJMkJDSjRCJURTVDJBMmRTVBM1YyRRMSQoRDV2YzRDVCUkQWFFNDYnQ0IkUzRjRkQ1dGI0VUYzRERCQ1I2dFNhREOUUjJDc0NTN0JFNUZJRGFpU1Q0QyJlNiMzNCZSKFQzYnNUWTMiRGMiRWdSQzMiQnQ0QSgjVUMiE0hRM1NVUiZVIlRkRVMzI2VkRjQWQ1YyRiZWNHQXQ0UllUMSVTJDQzMkWCQiRFglMzIzKEYzJSJFMyREVIQlVFFlYzMDQyVWUZNCQlM0NUJFIkWiNnREdEJDImNWJDOIcmKyQzc5VDVRQ3PVNjQzIkJTQ3FzMjMyRFVFVTUlNUZEMzEjI0Q0M0Y2U1JTREQjIhZScUJjQkYhFRJFQyI0pTVmFTVlMkJXNDI1U3dFZkR2U0NCVRQyRih0UkIhckRUY0ZHSG00EiJUdVIxVjVGNnUVZCxEQkNTQjQ0IkZDciIkODYxM1MzRZRHQxVEZHZWJFIzRRZjVDNBMzI1Q1FEhUMiI0NkJWJWJDJzYlQiRSQjRoRiRhJTIjNSRVJEM1MiYmUiNBkjFkczRWU1SURIJUVDRFQ0QyZCUlRENEImE2FDQxRjlEdTI3RSNEU3RGJyWDNVMTVJNDM1QkJFQmNWRXUlcxNEQzNTGCtDUlNDMzMzY2VlcUQlaUIyZVMzA3NFM1NDc0JjZDUkQiFDY3QUczQzUkVDQjMiUWQ0NEQyNRVTMRJFM2RUMZNSQkQ0MkIiUgGCUkRig1UiElQkdDJFJDciVGIxMjQzI1UlNlRTM1JkRDc+RSM0VFUzMjWCU0RDMxJyJVJGI1VTEUQyM1R0I1c0NFNTM3MhIlUkNFIlZGNURkVURyNIVCMyYzQmQjITRkVHQ2NINGQ0Y0UW0icyUzMydEVBJVJIJENkUjRVIjQSNVYnEzVYMzUmYzGVNFRiQk0iVTVCM0RjJSMyRWRSQkURJBR0M0NzhnRlM3IzQxMTRDJjM1UVUkJCNUQTVGQlEzN0VDMyM0MmO2QoQzNSVURhFEAkU2IldINHRUU00zNFJVQxUkZEcVMyJSJkQjKFNCNUOzIYJEHEUyKCQjJESSY=",
|
||||
"AQcH/x0BbjQ2JTpUlkdFRERHVDRkWDU0RGR1ejRURHZ6IzUqdJN1M1VFQiNHI1NTI0J1VHOGZYVFVTRIRJVkVmUolWVShERjSDRVMlRlJDU2VFh3UmR1Mjg3K0M2SUY0Q0ZUspNiJEdZMmc3YkxGSERGOGdjgzNRVGM1Q1UnN0RHU1Y0WWUzRWVEJSRSeGQ0RlNFJVVJU3YoQkdEQ2M2MiVFUyJWRUVWNmRkM0NkVER2WXNkR0QlNGNEVlYzZSS4RDMyVEQ1ckRTM0ZoMlQ2tURGQ0OFQ0ZiY1ZFNEajdXEjVSI6ZWSjNHVRRTRVMldzUjm0NGU0dlhESFRDM0IzVCYkdjdlJJRFVDaHEzUkRmNWOEVXZTM0U0VkREdSUjRHVVViVCVFVUN0RDNDkl01VHMoNVQzYlZFZmNVVUNDQ1VjUiQ2NTV0UzZVModSNEY4Zpc2JjhjiFJVUGM0SHI0UzRTU1R2R0d3VENUZSQzRUZlY4d0aGNkhTQzWVZFZTZkJ2NEZaVDU1alJWpFJpRGRnIlZUU1ZUR2M1NzOkVEMzVjZERiVlRYSEkmU4RLM0RTQ2Q2RjM3RTNhdVVEQzRXJUZTRmM1OEZTYyJkRGRjZDVTlDhSMzdXQiU1RFUiIoRpVGlXIjY1UVVjc0RDJDNSM0NVJTNkRUU1U0lDdEVXY2NGVVNJVmJJRTREVVNiMyVIQ3U6O0U0M0MzZFVVIzJmNERWJaJjikIlRXk1hFQ2NEU0RUN4UzdENEsVgzZFVidXUnU2VRZFRUQmZmRERCQ0ZER2Q3YnZFNlVpJUkzZVREKFWEUzVVMzYzQzQhfTYzQ0IlI5UoV0RGJCVXSDkyZCRSU3ITUkNoYzJUMkYzhlVVRTNyaDNmQzRDVVRjNkVUhEJyRBR2JlOEREVUU0RjY4Nkc3ZERGUyVDNFZGNFOTY3U1OKNlkjQy1TVlRTQ0M1REU2QhgzUzUzOWlWQ1Z3RTQzIzc7RXVkI0M4NCNYRVRGNEZbhFEyVJI0R1OUZEQ3VUVEQlU1NkNYJEYzdSQ0ZSNGeEWIVVU3KEVFY1RZQ0JSNEJFNFMyM0UzN0hHNTQjMlRGNkiEMyVjRFNVRXNkZGM2M4hENCMnU1VWQjNFRkO2VmO1RndEVzWTQiiHQ0NzM2clM4NjQxpjQjZEVTNEpEdlREJzc3OjZnRlNFNWJVNFeDokNCRmQ5NURJVUZSJyRDRXikVURVITZDNGW0ITNEOUQ0RUklZDQjYjVENURDRCRmRDU1hCY2VTR0RGIzJSZzlSczdTFJJkRlZyU1M1JTdVhDYhVFczQ0hTRIc0RCNDdUJEQxNlZEQ2ZEUiJJRFU3YzVGRER0R2ZlNFOTU1MyRGI0RzMkQ2Q="
|
||||
);
|
||||
|
||||
List<HyperLogLogCollector> collectors = Lists.transform(
|
||||
objects, new Function<String, HyperLogLogCollector>()
|
||||
{
|
||||
@Nullable
|
||||
@Override
|
||||
public HyperLogLogCollector apply(
|
||||
@Nullable String s
|
||||
)
|
||||
{
|
||||
return HyperLogLogCollector.makeCollector(
|
||||
ByteBuffer.wrap(Base64.decodeBase64(s))
|
||||
);
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
Collection<List<HyperLogLogCollector>> permutations = Collections2.permutations(collectors);
|
||||
|
||||
for(List<HyperLogLogCollector> permutation : permutations) {
|
||||
HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector();
|
||||
|
||||
for (HyperLogLogCollector foldee : permutation) {
|
||||
collector.fold(foldee);
|
||||
}
|
||||
Assert.assertEquals(29, collector.getMaxOverflowValue());
|
||||
Assert.assertEquals(366, collector.getMaxOverflowRegister());
|
||||
Assert.assertEquals(1.0429189446653817E7, collector.estimateCardinality(), 1);
|
||||
}
|
||||
}
|
||||
|
||||
// Provides a nice printout of error rates as a function of cardinality
|
||||
@Ignore @Test
|
||||
public void showErrorRate() throws Exception
|
||||
|
|
Loading…
Reference in New Issue