SOLR-7954: Fixed an integer overflow bug in the HyperLogLog code used by the 'cardinality' option of stats.field to prevent ArrayIndexOutOfBoundsException in a distributed search when a large precision is selected and a large number of values exist in each shard

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1697969 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Chris M. Hostetter 2015-08-26 16:33:39 +00:00
parent b5d5db70a0
commit 942750c33f
4 changed files with 191 additions and 49 deletions

View File

@ -159,6 +159,10 @@ Bug Fixes
* SOLR-7929: SimplePostTool (also bin/post) -filetypes "*" now works properly in 'web' mode (Erik Hatcher)
* SOLR-7954: Fixed an integer overflow bug in the HyperLogLog code used by the 'cardinality' option
of stats.field to prevent ArrayIndexOutOfBoundsException in a distributed search when a large precision
is selected and a large number of values exist in each shard (hossman)
Optimizations
----------------------

View File

@ -99,7 +99,7 @@ class BigEndianAscendingWordDeserializer implements IWordDeserializer {
}
// First bit of the word
final long firstBitIndex = (position * wordLength);
final long firstBitIndex = ((long)position) * ((long)wordLength);
final int firstByteIndex = (bytePadding + (int)(firstBitIndex / BITS_PER_BYTE));
final int firstByteSkipBits = (int)(firstBitIndex % BITS_PER_BYTE);

View File

@ -85,7 +85,7 @@ class BigEndianAscendingWordSerializer implements IWordSerializer {
this.wordLength = wordLength;
this.wordCount = wordCount;
final long bitsRequired = (wordLength * wordCount);
final long bitsRequired = ((long)wordLength) * ((long)wordCount);
final boolean leftoverBits = ((bitsRequired % BITS_PER_BYTE) != 0);
final int bytesRequired = (int)(bitsRequired / BITS_PER_BYTE) + (leftoverBits ? 1 : 0) + bytePadding;
bytes = new byte[bytesRequired];

View File

@ -18,6 +18,8 @@
package org.apache.solr.util.hll;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.junit.Test;
import static com.carrotsearch.randomizedtesting.RandomizedTest.*;
@ -27,6 +29,7 @@ import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Random;
import java.util.EnumSet;
import static org.apache.solr.util.hll.HLL.*;
@ -34,55 +37,190 @@ import static org.apache.solr.util.hll.HLL.*;
* Serialization smoke-tests.
*/
public class HLLSerializationTest extends LuceneTestCase {
/**
* A smoke-test that covers serialization/deserialization of an HLL
* under all possible parameters.
*/
@Test
@Slow
@Nightly
public void serializationSmokeTest() throws Exception {
final Random random = new Random(randomLong());
final int randomCount = 250;
final List<Long> randoms = new ArrayList<Long>(randomCount);
for (int i=0; i<randomCount; i++) {
randoms.add(random.nextLong());
}
assertCardinality(HLLType.EMPTY, randoms);
assertCardinality(HLLType.EXPLICIT, randoms);
assertCardinality(HLLType.SPARSE, randoms);
assertCardinality(HLLType.FULL, randoms);
/**
* A smoke-test that covers serialization/deserialization of an HLL
* under most possible init parameters.
*/
@Test
@Slow
@Nightly
public void serializationSmokeTest() throws Exception {
final Random random = new Random(randomLong());
final int randomCount = 250;
final List<Long> randoms = new ArrayList<Long>(randomCount);
for (int i=0; i<randomCount; i++) {
randoms.add(random.nextLong());
}
// NOTE: log2m<=16 was chosen as the max log2m parameter so that the test
// completes in a reasonable amount of time. Not much is gained by
// testing larger values - there are no more known serialization
// related edge cases that appear as log2m gets even larger.
// NOTE: This test completed successfully with log2m<=MAXIMUM_LOG2M_PARAM
// on 2014-01-30.
private static void assertCardinality(final HLLType hllType, final Collection<Long> items)
throws CloneNotSupportedException {
for(int log2m=MINIMUM_LOG2M_PARAM; log2m<=16; log2m++) {
for(int regw=MINIMUM_REGWIDTH_PARAM; regw<=MAXIMUM_REGWIDTH_PARAM; regw++) {
for(int expthr=MINIMUM_EXPTHRESH_PARAM; expthr<=MAXIMUM_EXPTHRESH_PARAM; expthr++ ) {
for(final boolean sparse: new boolean[]{true, false}) {
HLL hll = new HLL(log2m, regw, expthr, sparse, hllType);
for(final Long item: items) {
hll.addRaw(item);
}
HLL copy = HLL.fromBytes(hll.toBytes());
assertEquals(copy.cardinality(), hll.cardinality());
assertEquals(copy.getType(), hll.getType());
assertTrue(Arrays.equals(copy.toBytes(), hll.toBytes()));
HLL clone = hll.clone();
assertEquals(clone.cardinality(), hll.cardinality());
assertEquals(clone.getType(), hll.getType());
assertTrue(Arrays.equals(clone.toBytes(), hll.toBytes()));
}
}
}
}
// testing larger values
final int maxLog2m = 16;
for (HLLType type : EnumSet.allOf(HLLType.class)) {
assertCardinality(type, maxLog2m, randoms);
}
}
/**
* A smoke-test that covers serialization/deserialization of HLLs
* under the max possible numeric init parameters, iterating over all possible combinations of
* the other params.
*
* @see #manyValuesHLLSerializationTest
*/
@Test
@Slow
@Monster("needs roughly -Dtests.heapsize=8g because of the (multiple) massive data structs")
public void monsterHLLSerializationTest() throws Exception {
final Random random = new Random(randomLong());
final int randomCount = 250;
final List<Long> randoms = new ArrayList<Long>(randomCount);
for (int i=0; i<randomCount; i++) {
randoms.add(random.nextLong());
}
for (HLLType type : EnumSet.allOf(HLLType.class)) {
for (boolean sparse : new boolean[] {true, false} ) {
HLL hll = new HLL(MAXIMUM_LOG2M_PARAM, MAXIMUM_REGWIDTH_PARAM, MAXIMUM_EXPTHRESH_PARAM,
sparse, type);
assertCardinality(hll, randoms);
}
}
}
/**
* A smoke-test that covers serialization/deserialization of a (single) HLL
* with random init params with an extremely large number of unique values added to it.
*
* @see #monsterHLLSerializationTest
*/
@Test
@Slow
@Monster("may require as much as -Dtests.heapsize=4g depending on random values picked")
public void manyValuesHLLSerializationTest() throws Exception {
final HLLType[] ALL_TYPES = EnumSet.allOf(HLLType.class).toArray(new HLLType[0]);
Arrays.sort(ALL_TYPES);
final int log2m = TestUtil.nextInt(random(), MINIMUM_LOG2M_PARAM, MAXIMUM_LOG2M_PARAM);
final int regwidth = TestUtil.nextInt(random(), MINIMUM_REGWIDTH_PARAM, MAXIMUM_REGWIDTH_PARAM);
final int expthresh = TestUtil.nextInt(random(), MINIMUM_EXPTHRESH_PARAM, MAXIMUM_EXPTHRESH_PARAM);
final boolean sparse = random().nextBoolean();
final HLLType type = ALL_TYPES[TestUtil.nextInt(random(), 0, ALL_TYPES.length-1)];
HLL hll = new HLL(log2m, regwidth, expthresh, sparse, type);
final long NUM_VALS = TestUtil.nextLong(random(), 150000, 1000000);
final long MIN_VAL = TestUtil.nextLong(random(), Long.MIN_VALUE, Long.MAX_VALUE-NUM_VALS);
final long MAX_VAL = MIN_VAL + NUM_VALS;
assert MIN_VAL < MAX_VAL;
for (long val = MIN_VAL; val < MAX_VAL; val++) {
hll.addRaw(val);
}
final long expectedCardinality = hll.cardinality();
final HLLType expectedType = hll.getType();
byte[] serializedData = hll.toBytes();
hll = null; // allow some GC
HLL copy = HLL.fromBytes(serializedData);
serializedData = null; // allow some GC
assertEquals(expectedCardinality, copy.cardinality());
assertEquals(expectedType, copy.getType());
}
/**
* A smoke-test that covers serialization/deserialization of a (single) HLL
* with random the max possible numeric init parameters, with randomized values for the other params.
*
* @see #monsterHLLSerializationTest
*/
@Test
@Slow
@Monster("can require as much as -Dtests.heapsize=4g because of the massive data structs")
public void manyValuesMonsterHLLSerializationTest() throws Exception {
final HLLType[] ALL_TYPES = EnumSet.allOf(HLLType.class).toArray(new HLLType[0]);
Arrays.sort(ALL_TYPES);
final boolean sparse = random().nextBoolean();
final HLLType type = ALL_TYPES[TestUtil.nextInt(random(), 0, ALL_TYPES.length-1)];
HLL hll = new HLL(MAXIMUM_LOG2M_PARAM, MAXIMUM_REGWIDTH_PARAM, MAXIMUM_EXPTHRESH_PARAM, sparse, type);
final long NUM_VALS = TestUtil.nextLong(random(), 150000, 1000000);
final long MIN_VAL = TestUtil.nextLong(random(), Long.MIN_VALUE, Long.MAX_VALUE-NUM_VALS);
final long MAX_VAL = MIN_VAL + NUM_VALS;
assert MIN_VAL < MAX_VAL;
for (long val = MIN_VAL; val < MAX_VAL; val++) {
hll.addRaw(val);
}
final long expectedCardinality = hll.cardinality();
final HLLType expectedType = hll.getType();
byte[] serializedData = hll.toBytes();
hll = null; // allow some GC
HLL copy = HLL.fromBytes(serializedData);
serializedData = null; // allow some GC
assertEquals(expectedCardinality, copy.cardinality());
assertEquals(expectedType, copy.getType());
}
/**
* Iterates over all possible constructor args, with the exception of log2m,
* which is only iterated up to the specified max so the test runs in a
* "reasonable" amount of time and ram.
*/
private static void assertCardinality(final HLLType hllType,
final int maxLog2m,
final Collection<Long> items) throws CloneNotSupportedException {
for(int regw=MINIMUM_REGWIDTH_PARAM; regw<=MAXIMUM_REGWIDTH_PARAM; regw++) {
for(int expthr=MINIMUM_EXPTHRESH_PARAM; expthr<=MAXIMUM_EXPTHRESH_PARAM; expthr++ ) {
for(final boolean sparse: new boolean[]{true, false}) {
for(int log2m=MINIMUM_LOG2M_PARAM; log2m<=maxLog2m; log2m++) {
assertCardinality(new HLL(log2m, regw, expthr, sparse, hllType), items);
}
}
}
}
}
/**
* Adds all of the items to the specified hll, then does a round trip serialize/deserialize and confirms
* equality of several properties (including the byte serialization). Repeats process with a clone.
*/
private static void assertCardinality(HLL hll, final Collection<Long> items)
throws CloneNotSupportedException {
for (final Long item: items) {
hll.addRaw(item);
}
final long hllCardinality = hll.cardinality();
final HLLType hllType = hll.getType();
final byte[] hllBytes = hll.toBytes();
hll = null; // allow some GC
HLL copy = HLL.fromBytes(hllBytes);
assertEquals(copy.cardinality(), hllCardinality);
assertEquals(copy.getType(), hllType);
assertTrue(Arrays.equals(copy.toBytes(), hllBytes));
HLL clone = copy.clone();
copy = null; // allow some GC
assertEquals(clone.cardinality(), hllCardinality);
assertEquals(clone.getType(), hllType);
assertTrue(Arrays.equals(clone.toBytes(), hllBytes));
}
}