mirror of https://github.com/apache/lucene.git
SOLR-7954: Fixed an integer overflow bug in the HyperLogLog code used by the 'cardinality' option of stats.field to prevent ArrayIndexOutOfBoundsException in a distributed search when a large precision is selected and a large number of values exist in each shard
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1697969 13f79535-47bb-0310-9956-ffa450edef68
parent b5d5db70a0
commit 942750c33f
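The affected code is reached through the stats component's cardinality option named in the commit message. As a purely illustrative, hedged example (the field name and tuning value are hypothetical, not taken from this commit), a request such as

    stats=true&stats.field={!cardinality=1.0}some_long_field

asks for the most accurate (largest precision, most memory-hungry) HyperLogLog configuration, i.e. the "large precision" case in which the overflow could occur.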
@@ -159,6 +159,10 @@ Bug Fixes
 
 * SOLR-7929: SimplePostTool (also bin/post) -filetypes "*" now works properly in 'web' mode (Erik Hatcher)
 
+* SOLR-7954: Fixed an integer overflow bug in the HyperLogLog code used by the 'cardinality' option
+  of stats.field to prevent ArrayIndexOutOfBoundsException in a distributed search when a large precision
+  is selected and a large number of values exist in each shard (hossman)
+
 Optimizations
 ----------------------
 
@@ -99,7 +99,7 @@ class BigEndianAscendingWordDeserializer implements IWordDeserializer {
         }
 
         // First bit of the word
-        final long firstBitIndex = (position * wordLength);
+        final long firstBitIndex = ((long)position) * ((long)wordLength);
         final int firstByteIndex = (bytePadding + (int)(firstBitIndex / BITS_PER_BYTE));
         final int firstByteSkipBits = (int)(firstBitIndex % BITS_PER_BYTE);
 
@@ -85,7 +85,7 @@ class BigEndianAscendingWordSerializer implements IWordSerializer {
         this.wordLength = wordLength;
         this.wordCount = wordCount;
 
-        final long bitsRequired = (wordLength * wordCount);
+        final long bitsRequired = ((long)wordLength) * ((long)wordCount);
         final boolean leftoverBits = ((bitsRequired % BITS_PER_BYTE) != 0);
         final int bytesRequired = (int)(bitsRequired / BITS_PER_BYTE) + (leftoverBits ? 1 : 0) + bytePadding;
         bytes = new byte[bytesRequired];
 
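Both hunks above fix the same pattern: two int values are multiplied in 32-bit arithmetic and only afterwards widened to long, so with a large HyperLogLog precision (many registers times a wide register width) the product wraps around before it is stored. A minimal sketch of the difference, assuming purely illustrative values that are not taken from the patch:

    // Illustrative only: the cast to long must happen before the multiplication.
    public class IntOverflowDemo {
        public static void main(String[] args) {
            final int position   = 500_000_000; // hypothetical register index
            final int wordLength = 6;           // hypothetical register width in bits

            final long broken = (position * wordLength);                 // 32-bit multiply wraps: -1294967296
            final long fixed  = ((long) position) * ((long) wordLength); // widened first: 3000000000

            System.out.println(broken);
            System.out.println(fixed);
        }
    }

A wrapped (negative) bit index then produces a negative byte offset into the backing array, which is the ArrayIndexOutOfBoundsException described in the commit message.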
@@ -18,6 +18,8 @@
 package org.apache.solr.util.hll;
 
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
 
 import org.junit.Test;
 
 import static com.carrotsearch.randomizedtesting.RandomizedTest.*;

@@ -27,6 +29,7 @@ import java.util.Arrays;
 import java.util.Collection;
 import java.util.List;
 import java.util.Random;
+import java.util.EnumSet;
 
 import static org.apache.solr.util.hll.HLL.*;

@@ -34,55 +37,190 @@ import static org.apache.solr.util.hll.HLL.*;
 * Serialization smoke-tests.
 */
public class HLLSerializationTest extends LuceneTestCase {

    /**
     * A smoke-test that covers serialization/deserialization of an HLL
     * under all possible parameters.
     */
    @Test
    @Slow
    @Nightly
    public void serializationSmokeTest() throws Exception {
        final Random random = new Random(randomLong());
        final int randomCount = 250;
        final List<Long> randoms = new ArrayList<Long>(randomCount);
        for (int i=0; i<randomCount; i++) {
            randoms.add(random.nextLong());
        }

        assertCardinality(HLLType.EMPTY, randoms);
        assertCardinality(HLLType.EXPLICIT, randoms);
        assertCardinality(HLLType.SPARSE, randoms);
        assertCardinality(HLLType.FULL, randoms);

    /**
     * A smoke-test that covers serialization/deserialization of an HLL
     * under most possible init parameters.
     */
    @Test
    @Slow
    @Nightly
    public void serializationSmokeTest() throws Exception {
        final Random random = new Random(randomLong());
        final int randomCount = 250;
        final List<Long> randoms = new ArrayList<Long>(randomCount);
        for (int i=0; i<randomCount; i++) {
            randoms.add(random.nextLong());
        }

    // NOTE: log2m<=16 was chosen as the max log2m parameter so that the test
    // completes in a reasonable amount of time.  Not much is gained by
    // testing larger values - there are no more known serialization
    // related edge cases that appear as log2m gets even larger.
    // NOTE: This test completed successfully with log2m<=MAXIMUM_LOG2M_PARAM
    // on 2014-01-30.
    private static void assertCardinality(final HLLType hllType, final Collection<Long> items)
            throws CloneNotSupportedException {
        for(int log2m=MINIMUM_LOG2M_PARAM; log2m<=16; log2m++) {
            for(int regw=MINIMUM_REGWIDTH_PARAM; regw<=MAXIMUM_REGWIDTH_PARAM; regw++) {
                for(int expthr=MINIMUM_EXPTHRESH_PARAM; expthr<=MAXIMUM_EXPTHRESH_PARAM; expthr++ ) {
                    for(final boolean sparse: new boolean[]{true, false}) {
                        HLL hll = new HLL(log2m, regw, expthr, sparse, hllType);
                        for(final Long item: items) {
                            hll.addRaw(item);
                        }
                        HLL copy = HLL.fromBytes(hll.toBytes());
                        assertEquals(copy.cardinality(), hll.cardinality());
                        assertEquals(copy.getType(), hll.getType());
                        assertTrue(Arrays.equals(copy.toBytes(), hll.toBytes()));

                        HLL clone = hll.clone();
                        assertEquals(clone.cardinality(), hll.cardinality());
                        assertEquals(clone.getType(), hll.getType());
                        assertTrue(Arrays.equals(clone.toBytes(), hll.toBytes()));
                    }
                }
            }
        }
        // testing larger values
        final int maxLog2m = 16;
        for (HLLType type : EnumSet.allOf(HLLType.class)) {
            assertCardinality(type, maxLog2m, randoms);
        }
    }

    /**
     * A smoke-test that covers serialization/deserialization of HLLs
     * under the max possible numeric init parameters, iterating over all possible combinations of
     * the other params.
     *
     * @see #manyValuesHLLSerializationTest
     */
    @Test
    @Slow
    @Monster("needs roughly -Dtests.heapsize=8g because of the (multiple) massive data structs")
    public void monsterHLLSerializationTest() throws Exception {
        final Random random = new Random(randomLong());
        final int randomCount = 250;
        final List<Long> randoms = new ArrayList<Long>(randomCount);
        for (int i=0; i<randomCount; i++) {
            randoms.add(random.nextLong());
        }

        for (HLLType type : EnumSet.allOf(HLLType.class)) {
            for (boolean sparse : new boolean[] {true, false} ) {
                HLL hll = new HLL(MAXIMUM_LOG2M_PARAM, MAXIMUM_REGWIDTH_PARAM, MAXIMUM_EXPTHRESH_PARAM,
                                  sparse, type);
                assertCardinality(hll, randoms);
            }
        }
    }

    /**
     * A smoke-test that covers serialization/deserialization of a (single) HLL
     * with random init params with an extremely large number of unique values added to it.
     *
     * @see #monsterHLLSerializationTest
     */
    @Test
    @Slow
    @Monster("may require as much as -Dtests.heapsize=4g depending on random values picked")
    public void manyValuesHLLSerializationTest() throws Exception {

        final HLLType[] ALL_TYPES = EnumSet.allOf(HLLType.class).toArray(new HLLType[0]);
        Arrays.sort(ALL_TYPES);

        final int log2m = TestUtil.nextInt(random(), MINIMUM_LOG2M_PARAM, MAXIMUM_LOG2M_PARAM);
        final int regwidth = TestUtil.nextInt(random(), MINIMUM_REGWIDTH_PARAM, MAXIMUM_REGWIDTH_PARAM);
        final int expthresh = TestUtil.nextInt(random(), MINIMUM_EXPTHRESH_PARAM, MAXIMUM_EXPTHRESH_PARAM);
        final boolean sparse = random().nextBoolean();
        final HLLType type = ALL_TYPES[TestUtil.nextInt(random(), 0, ALL_TYPES.length-1)];

        HLL hll = new HLL(log2m, regwidth, expthresh, sparse, type);

        final long NUM_VALS = TestUtil.nextLong(random(), 150000, 1000000);
        final long MIN_VAL = TestUtil.nextLong(random(), Long.MIN_VALUE, Long.MAX_VALUE-NUM_VALS);
        final long MAX_VAL = MIN_VAL + NUM_VALS;
        assert MIN_VAL < MAX_VAL;

        for (long val = MIN_VAL; val < MAX_VAL; val++) {
            hll.addRaw(val);
        }

        final long expectedCardinality = hll.cardinality();
        final HLLType expectedType = hll.getType();

        byte[] serializedData = hll.toBytes();
        hll = null; // allow some GC

        HLL copy = HLL.fromBytes(serializedData);
        serializedData = null; // allow some GC

        assertEquals(expectedCardinality, copy.cardinality());
        assertEquals(expectedType, copy.getType());

    }

    /**
     * A smoke-test that covers serialization/deserialization of a (single) HLL
     * with random the max possible numeric init parameters, with randomized values for the other params.
     *
     * @see #monsterHLLSerializationTest
     */
    @Test
    @Slow
    @Monster("can require as much as -Dtests.heapsize=4g because of the massive data structs")
    public void manyValuesMonsterHLLSerializationTest() throws Exception {

        final HLLType[] ALL_TYPES = EnumSet.allOf(HLLType.class).toArray(new HLLType[0]);
        Arrays.sort(ALL_TYPES);

        final boolean sparse = random().nextBoolean();
        final HLLType type = ALL_TYPES[TestUtil.nextInt(random(), 0, ALL_TYPES.length-1)];

        HLL hll = new HLL(MAXIMUM_LOG2M_PARAM, MAXIMUM_REGWIDTH_PARAM, MAXIMUM_EXPTHRESH_PARAM, sparse, type);

        final long NUM_VALS = TestUtil.nextLong(random(), 150000, 1000000);
        final long MIN_VAL = TestUtil.nextLong(random(), Long.MIN_VALUE, Long.MAX_VALUE-NUM_VALS);
        final long MAX_VAL = MIN_VAL + NUM_VALS;
        assert MIN_VAL < MAX_VAL;

        for (long val = MIN_VAL; val < MAX_VAL; val++) {
            hll.addRaw(val);
        }

        final long expectedCardinality = hll.cardinality();
        final HLLType expectedType = hll.getType();

        byte[] serializedData = hll.toBytes();
        hll = null; // allow some GC

        HLL copy = HLL.fromBytes(serializedData);
        serializedData = null; // allow some GC

        assertEquals(expectedCardinality, copy.cardinality());
        assertEquals(expectedType, copy.getType());

    }

    /**
     * Iterates over all possible constructor args, with the exception of log2m,
     * which is only iterated up to the specified max so the test runs in a
     * "reasonable" amount of time and ram.
     */
    private static void assertCardinality(final HLLType hllType,
                                          final int maxLog2m,
                                          final Collection<Long> items) throws CloneNotSupportedException {
        for(int regw=MINIMUM_REGWIDTH_PARAM; regw<=MAXIMUM_REGWIDTH_PARAM; regw++) {
            for(int expthr=MINIMUM_EXPTHRESH_PARAM; expthr<=MAXIMUM_EXPTHRESH_PARAM; expthr++ ) {
                for(final boolean sparse: new boolean[]{true, false}) {
                    for(int log2m=MINIMUM_LOG2M_PARAM; log2m<=maxLog2m; log2m++) {
                        assertCardinality(new HLL(log2m, regw, expthr, sparse, hllType), items);
                    }
                }
            }
        }
    }

    /**
     * Adds all of the items to the specified hll, then does a round trip serialize/deserialize and confirms
     * equality of several properties (including the byte serialization). Repeats process with a clone.
     */
    private static void assertCardinality(HLL hll, final Collection<Long> items)
            throws CloneNotSupportedException {

        for (final Long item: items) {
            hll.addRaw(item);
        }

        final long hllCardinality = hll.cardinality();
        final HLLType hllType = hll.getType();
        final byte[] hllBytes = hll.toBytes();
        hll = null; // allow some GC

        HLL copy = HLL.fromBytes(hllBytes);
        assertEquals(copy.cardinality(), hllCardinality);
        assertEquals(copy.getType(), hllType);
        assertTrue(Arrays.equals(copy.toBytes(), hllBytes));

        HLL clone = copy.clone();
        copy = null; // allow some GC

        assertEquals(clone.cardinality(), hllCardinality);
        assertEquals(clone.getType(), hllType);
        assertTrue(Arrays.equals(clone.toBytes(), hllBytes));
    }
}
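The @Nightly and @Monster tests added above are skipped in a default test run; the Lucene/Solr test framework only runs them when the matching system properties are enabled. A rough, hedged sketch of an invocation (property names as commonly used by the ant build of this era; the heap size comes straight from the @Monster annotation text):

    ant test -Dtestcase=HLLSerializationTest -Dtests.nightly=true -Dtests.monster=true -Dtests.heapsize=8g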