Ported tests as well.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/solr7787@1691351 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Dawid Weiss 2015-07-16 10:47:37 +00:00
parent 1842589815
commit 139460e8c5
10 changed files with 2643 additions and 0 deletions

View File

@@ -0,0 +1,191 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.util.hll;
import java.util.Random;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
import static com.carrotsearch.randomizedtesting.RandomizedTest.*;
/**
* Unit and smoke tests for {@link BigEndianAscendingWordDeserializer}.
*
* @author timon
*/
public class BigEndianAscendingWordDeserializerTest extends LuceneTestCase {
  /**
   * Error checking tests for the constructor: word length must lie within
   * [1, 64] and byte padding must be non-negative.
   */
  @Test
  public void constructorErrorTest() {
    // word length too small
    try {
      new BigEndianAscendingWordDeserializer(0/*wordLength, below minimum of 1*/, 0/*bytePadding, arbitrary*/, new byte[1]/*bytes, arbitrary, not used here*/);
      fail("Should complain about too-short words.");
    } catch(final IllegalArgumentException e) {
      assertTrue(e.getMessage().contains("Word length must be"));
    }

    // word length too large
    try {
      new BigEndianAscendingWordDeserializer(65/*wordLength, above maximum of 64*/, 0/*bytePadding, arbitrary*/, new byte[1]/*bytes, arbitrary, not used here*/);
      fail("Should complain about too-long words.");
    } catch(final IllegalArgumentException e) {
      assertTrue(e.getMessage().contains("Word length must be"));
    }

    // byte padding negative
    try {
      new BigEndianAscendingWordDeserializer(5/*wordLength, arbitrary*/, -1/*bytePadding, too small*/, new byte[1]/*bytes, arbitrary, not used here*/);
      fail("Should complain about negative byte padding.");
    } catch(final IllegalArgumentException e) {
      assertTrue(e.getMessage().contains("Byte padding must be"));
    }
  }

  /**
   * Smoke test using 64-bit words and special word values: verifies that the
   * sign bit and the extreme long values survive a serialize/deserialize
   * round-trip.
   */
  @Test
  public void smokeTest64BitWord() {
    final BigEndianAscendingWordSerializer serializer =
        new BigEndianAscendingWordSerializer(64/*wordLength*/,
                                             5/*wordCount*/,
                                             0/*bytePadding, arbitrary*/);

    // Check that the sign bit is being preserved.
    serializer.writeWord(-1L);
    serializer.writeWord(-112894714L);

    // Check "special" values
    serializer.writeWord(0L);
    serializer.writeWord(Long.MAX_VALUE);
    serializer.writeWord(Long.MIN_VALUE);

    final byte[] bytes = serializer.getBytes();

    final BigEndianAscendingWordDeserializer deserializer =
        new BigEndianAscendingWordDeserializer(64/*wordLength*/, 0/*bytePadding*/, bytes);
    // NOTE: JUnit's assertEquals takes (expected, actual) -- keeping that order
    // makes failure messages report the right roles.
    assertEquals(5/*wordCount*/, deserializer.totalWordCount());

    assertEquals(-1L, deserializer.readWord());
    assertEquals(-112894714L, deserializer.readWord());
    assertEquals(0L, deserializer.readWord());
    assertEquals(Long.MAX_VALUE, deserializer.readWord());
    assertEquals(Long.MIN_VALUE, deserializer.readWord());
  }

  /**
   * A smoke/fuzz test for ascending (from zero) word values across every
   * supported word length.
   */
  @Test
  public void ascendingSmokeTest() {
    for(int wordLength=5; wordLength<65; wordLength++) {
      runAscendingTest(wordLength, 3/*bytePadding, arbitrary*/, 100000/*wordCount, arbitrary*/);
    }
  }

  /**
   * A smoke/fuzz test for random word values across every supported word
   * length.
   */
  @Test
  public void randomSmokeTest() {
    for(int wordLength=5; wordLength<65; wordLength++) {
      runRandomTest(wordLength, 3/*bytePadding, arbitrary*/, 100000/*wordCount, arbitrary*/);
    }
  }

  // ------------------------------------------------------------------------
  /**
   * Runs a test which serializes and deserializes random word values, using
   * two {@link Random} instances seeded identically: one to generate the
   * words that are written, the other to regenerate the same sequence for
   * verification.
   *
   * @param wordLength the length of words to test
   * @param bytePadding the number of bytes padding the byte array
   * @param wordCount the number of word values to test
   */
  private static void runRandomTest(final int wordLength, final int bytePadding, final int wordCount) {
    final long seed = randomLong();
    final Random random = new Random(seed);
    final Random verificationRandom = new Random(seed);

    // Mask off the low 'wordLength' bits; shifting by 64 would be a no-op in
    // Java, so the full-width case is special-cased.
    final long wordMask;
    if(wordLength == 64) {
      wordMask = ~0L;
    } else {
      wordMask = (1L << wordLength) - 1L;
    }

    final BigEndianAscendingWordSerializer serializer =
        new BigEndianAscendingWordSerializer(wordLength/*wordLength, arbitrary*/,
                                             wordCount,
                                             bytePadding/*bytePadding, arbitrary*/);

    for(int i=0; i<wordCount; i++) {
      final long value = random.nextLong() & wordMask;
      serializer.writeWord(value);
    }

    final byte[] bytes = serializer.getBytes();

    final BigEndianAscendingWordDeserializer deserializer =
        new BigEndianAscendingWordDeserializer(wordLength, bytePadding, bytes);
    assertEquals(wordCount, deserializer.totalWordCount());

    for(int i=0; i<wordCount; i++) {
      assertEquals((verificationRandom.nextLong() & wordMask), deserializer.readWord());
    }
  }

  /**
   * Runs a test which serializes and deserializes ascending (from zero) word
   * values.
   *
   * @param wordLength the length of words to test
   * @param bytePadding the number of bytes padding the byte array
   * @param wordCount the number of word values to test
   */
  private static void runAscendingTest(final int wordLength, final int bytePadding, final int wordCount) {
    // Mask off the low 'wordLength' bits (see runRandomTest for the 64-bit
    // special case rationale).
    final long wordMask;
    if(wordLength == 64) {
      wordMask = ~0L;
    } else {
      wordMask = (1L << wordLength) - 1L;
    }

    final BigEndianAscendingWordSerializer serializer =
        new BigEndianAscendingWordSerializer(wordLength/*wordLength, arbitrary*/,
                                             wordCount,
                                             bytePadding/*bytePadding, arbitrary*/);

    for(long i=0; i<wordCount; i++) {
      serializer.writeWord(i & wordMask);
    }

    final byte[] bytes = serializer.getBytes();

    final BigEndianAscendingWordDeserializer deserializer =
        new BigEndianAscendingWordDeserializer(wordLength, bytePadding, bytes);
    assertEquals(wordCount, deserializer.totalWordCount());

    for(long i=0; i<wordCount; i++) {
      assertEquals(i & wordMask, deserializer.readWord());
    }
  }
}

View File

@@ -0,0 +1,337 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.util.hll;
import java.util.Arrays;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
/**
* Unit tests for {@link BigEndianAscendingWordSerializer}.
*/
public class BigEndianAscendingWordSerializerTest extends LuceneTestCase {
/**
* Error checking tests for constructor.
*/
@Test
public void constructorErrorTest() {
// word length too small
try {
new BigEndianAscendingWordSerializer(0/*wordLength, below minimum of 1*/, 1/*wordCount, arbitrary*/, 0/*bytePadding, arbitrary*/);
fail("Should complain about too-short words.");
} catch(final IllegalArgumentException e) {
assertTrue(e.getMessage().contains("Word length must be"));
}
// word length too large
try {
new BigEndianAscendingWordSerializer(65/*wordLength, above max of 64*/, 1/*wordCount, arbitrary*/, 0/*bytePadding, arbitrary*/);
fail("Should complain about too-long words.");
} catch(final IllegalArgumentException e) {
assertTrue(e.getMessage().contains("Word length must be"));
}
// word count negative
try {
new BigEndianAscendingWordSerializer(5/*wordLength, arbitrary*/, -1/*wordCount, too small*/, 0/*bytePadding, arbitrary*/);
fail("Should complain about negative word count.");
} catch(final IllegalArgumentException e) {
assertTrue(e.getMessage().contains("Word count must be"));
}
// byte padding negative
try {
new BigEndianAscendingWordSerializer(5/*wordLength, arbitrary*/, 1/*wordCount, arbitrary*/, -1/*bytePadding, too small*/);
fail("Should complain about negative byte padding.");
} catch(final IllegalArgumentException e) {
assertTrue(e.getMessage().contains("Byte padding must be"));
}
}
/**
* Tests runtime exception thrown at premature call to {@link BigEndianAscendingWordSerializer#getBytes()}.
*/
@Test
public void earlyGetBytesTest() {
final BigEndianAscendingWordSerializer serializer =
new BigEndianAscendingWordSerializer(5/*wordLength, arbitrary*/,
1/*wordCount*/,
0/*bytePadding, arbitrary*/);
// getBytes without enough writeWord should throw
try {
serializer.getBytes();
fail("Should throw.");
} catch(final RuntimeException e) {
assertTrue(e.getMessage().contains("Not all words"));
}
}
/**
*/
@Test
public void smokeTestExplicitParams() {
final int shortWordLength = 64/*longs used in LongSetSlab*/;
{// Should work on an empty sequence, with no padding.
final BigEndianAscendingWordSerializer serializer =
new BigEndianAscendingWordSerializer(shortWordLength,
0/*wordCount*/,
0/*bytePadding, none*/);
assert(Arrays.equals(serializer.getBytes(), new byte[0]));
}
{// Should work on a byte-divisible sequence, with no padding.
final BigEndianAscendingWordSerializer serializer =
new BigEndianAscendingWordSerializer(shortWordLength,
2/*wordCount*/,
0/*bytePadding, none*/);
serializer.writeWord(0xBAAAAAAAAAAAAAACL);
serializer.writeWord(0x8FFFFFFFFFFFFFF1L);
// Bytes:
// ======
// 0xBA 0xAA 0xAA 0xAA 0xAA 0xAA 0xAA 0xAC
// 0x8F 0xFF 0xFF 0xFF 0xFF 0xFF 0xFF 0xF1
//
// -70 -86 ... -84
// -113 -1 ... -15
final byte[] bytes = serializer.getBytes();
final byte[] expectedBytes = new byte[] { -70, -86, -86, -86, -86, -86, -86, -84,
-113, -1, -1, -1, -1, -1, -1, -15 };
assertTrue(Arrays.equals(bytes, expectedBytes));
}
{// Should pad the array correctly.
final BigEndianAscendingWordSerializer serializer =
new BigEndianAscendingWordSerializer(shortWordLength,
1/*wordCount*/,
1/*bytePadding*/);
serializer.writeWord(1);
// 1 byte leading padding | value 1 | trailing padding
// 0000 0000 | 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0001
// 0x00 | 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x01
final byte[] bytes = serializer.getBytes();
final byte[] expectedBytes = new byte[] { 0, 0, 0, 0, 0, 0, 0, 0, 1 };
assertTrue(Arrays.equals(bytes, expectedBytes));
}
}
/**
* Smoke test for typical parameters used in practice.
*/
@Test
public void smokeTestProbabilisticParams() {
// XXX: revisit this
final int shortWordLength = 5;
{// Should work on an empty sequence, with no padding.
final BigEndianAscendingWordSerializer serializer =
new BigEndianAscendingWordSerializer(shortWordLength,
0/*wordCount*/,
0/*bytePadding, none*/);
assert(Arrays.equals(serializer.getBytes(), new byte[0]));
}
{// Should work on a non-byte-divisible sequence, with no padding.
final BigEndianAscendingWordSerializer serializer =
new BigEndianAscendingWordSerializer(shortWordLength,
3/*wordCount*/,
0/*bytePadding, none*/);
serializer.writeWord(9);
serializer.writeWord(31);
serializer.writeWord(1);
// The values:
// -----------
// 9 |31 |1 |padding
// Corresponding bits:
// ------------------
// 0100 1|111 11|00 001|0
// And the hex/decimal (remember Java bytes are signed):
// -----------------------------------------------------
// 0100 1111 -> 0x4F -> 79
// 1100 0010 -> 0xC2 -> -62
final byte[] bytes = serializer.getBytes();
final byte[] expectedBytes = new byte[] { 79, -62 };
assertTrue(Arrays.equals(bytes, expectedBytes));
}
{// Should work on a byte-divisible sequence, with no padding.
final BigEndianAscendingWordSerializer serializer =
new BigEndianAscendingWordSerializer(shortWordLength,
8/*wordCount*/,
0/*bytePadding, none*/);
for(int i=1; i<9; i++) {
serializer.writeWord(i);
}
// Values: 1-8
// Corresponding bits:
// ------------------
// 00001
// 00010
// 00011
// 00100
// 00101
// 00110
// 00111
// 01000
// And the hex:
// ------------
// 0000 1000 => 0x08 => 8
// 1000 0110 => 0x86 => -122
// 0100 0010 => 0x62 => 66
// 1001 1000 => 0x98 => -104
// 1110 1000 => 0xE8 => -24
final byte[] bytes = serializer.getBytes();
final byte[] expectedBytes = new byte[] { 8, -122, 66, -104, -24 };
assertTrue(Arrays.equals(bytes, expectedBytes));
}
{// Should pad the array correctly.
final BigEndianAscendingWordSerializer serializer =
new BigEndianAscendingWordSerializer(shortWordLength,
1/*wordCount*/,
1/*bytePadding*/);
serializer.writeWord(1);
// 1 byte leading padding | value 1 | trailing padding
// 0000 0000 | 0000 1|000
final byte[] bytes = serializer.getBytes();
final byte[] expectedBytes = new byte[] { 0, 8 };
assertTrue(Arrays.equals(bytes, expectedBytes));
}
}
/**
* Smoke test for typical parameters used in practice.
*/
@Test
public void smokeTestSparseParams() {
// XXX: revisit
final int shortWordLength = 17;
{// Should work on an empty sequence, with no padding.
final BigEndianAscendingWordSerializer serializer =
new BigEndianAscendingWordSerializer(shortWordLength,
0/*wordCount*/,
0/*bytePadding, none*/);
assert(Arrays.equals(serializer.getBytes(), new byte[0]));
}
{// Should work on a non-byte-divisible sequence, with no padding.
final BigEndianAscendingWordSerializer serializer =
new BigEndianAscendingWordSerializer(shortWordLength,
3/*wordCount*/,
0/*bytePadding, none*/);
serializer.writeWord(9);
serializer.writeWord(42);
serializer.writeWord(75);
// The values:
// -----------
// 9 |42 |75 |padding
// Corresponding bits:
// ------------------
// 0000 0000 0000 0100 1|000 0000 0000 1010 10|00 0000 0000 1001 011|0 0000
// And the hex/decimal (remember Java bytes are signed):
// -----------------------------------------------------
// 0000 0000 -> 0x00 -> 0
// 0000 0100 -> 0x04 -> 4
// 1000 0000 -> 0x80 -> -128
// 0000 1010 -> 0x0A -> 10
// 1000 0000 -> 0x80 -> -128
// 0000 1001 -> 0x09 -> 9
// 0110 0000 -> 0x60 -> 96
final byte[] bytes = serializer.getBytes();
final byte[] expectedBytes = new byte[] { 0, 4, -128, 10, -128, 9, 96 };
assertTrue(Arrays.equals(bytes, expectedBytes));
}
{// Should work on a byte-divisible sequence, with no padding.
final BigEndianAscendingWordSerializer serializer =
new BigEndianAscendingWordSerializer(shortWordLength,
8/*wordCount*/,
0/*bytePadding, none*/);
for(int i=1; i<9; i++) {
serializer.writeWord(i);
}
// Values: 1-8
// Corresponding bits:
// ------------------
// 0000 0000 0000 0000 1
// 000 0000 0000 0000 10
// 00 0000 0000 0000 011
// 0 0000 0000 0000 0100
// 0000 0000 0000 0010 1
// 000 0000 0000 0001 10
// 00 0000 0000 0000 111
// 0 0000 0000 0000 1000
// And the hex:
// ------------
// 0000 0000 -> 0x00 -> 0
// 0000 0000 -> 0x00 -> 0
// 1000 0000 -> 0x80 -> -128
// 0000 0000 -> 0x00 -> 0
// 1000 0000 -> 0x80 -> -128
// 0000 0000 -> 0x00 -> 0
// 0110 0000 -> 0x60 -> 96
// 0000 0000 -> 0x00 -> 0
// 0100 0000 -> 0x40 -> 64
// 0000 0000 -> 0x00 -> 0
// 0010 1000 -> 0x28 -> 40
// 0000 0000 -> 0x00 -> 0
// 0001 1000 -> 0x18 -> 24
// 0000 0000 -> 0x00 -> 0
// 0000 1110 -> 0x0D -> 14
// 0000 0000 -> 0x00 -> 0
// 0000 1000 -> 0x08 -> 8
final byte[] bytes = serializer.getBytes();
final byte[] expectedBytes = new byte[] { 0, 0, -128, 0, -128, 0, 96, 0, 64, 0, 40, 0, 24, 0, 14, 0, 8 };
assertTrue(Arrays.equals(bytes, expectedBytes));
}
{// Should pad the array correctly.
final BigEndianAscendingWordSerializer serializer =
new BigEndianAscendingWordSerializer(shortWordLength,
1/*wordCount*/,
1/*bytePadding*/);
serializer.writeWord(1);
// 1 byte leading padding | value 1 | trailing padding
// 0000 0000 | 0000 0000 0000 0000 1|000 0000
// 0x00 0x00 0x00 0x80
final byte[] bytes = serializer.getBytes();
final byte[] expectedBytes = new byte[] { 0, 0, 0, -128 };
assertTrue(Arrays.equals(bytes, expectedBytes));
}
}
}

View File

@@ -0,0 +1,167 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.util.hll;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
/**
* Unit tests for {@link BitVector}.
*/
public class BitVectorTest extends LuceneTestCase {
  /**
   * Tests {@link BitVector#getRegister(long)} and
   * {@link BitVector#setRegister(long, long)} with four fill patterns:
   * all-max, ascending, descending, and a constant mid-range value.
   */
  @Test
  public void getSetRegisterTest() {
    { // locally scoped for sanity
      // NOTE: registers are only 5bits wide
      final BitVector vector1 = new BitVector(5/*width*/, 128/*count, 2^7*/);
      final BitVector vector2 = new BitVector(5/*width*/, 128/*count, 2^7*/);
      final BitVector vector3 = new BitVector(5/*width*/, 128/*count, 2^7*/);
      final BitVector vector4 = new BitVector(5/*width*/, 128/*count, 2^7*/);

      for(int i=0; i<128/*2^7*/; i++) {
        vector1.setRegister(i, 0x1F);
        vector2.setRegister(i, (i & 0x1F));
        vector3.setRegister(i, ((127 - i) & 0x1F));
        vector4.setRegister(i, 0x15);
      }

      // NOTE: assertEquals takes (expected, actual) so failure messages
      // report the right roles.
      for(int i=0; i<128/*2^7*/; i++) {
        assertEquals(0x1F, vector1.getRegister(i));
        assertEquals((i & 0x1F), vector2.getRegister(i));
        assertEquals(((127 - i) & 0x1F), vector3.getRegister(i));
        assertEquals(0x15, vector4.getRegister(i));
      }
    }
  }

  // ========================================================================
  /**
   * Tests {@link BitVector#registerIterator()}
   */
  @Test
  public void registerIteratorTest() {
    { // scoped locally for sanity
      // NOTE: registers are only 5bits wide
      final BitVector vector1 = new BitVector(5/*width*/, 128/*count, 2^7*/);
      final BitVector vector2 = new BitVector(5/*width*/, 128/*count, 2^7*/);
      final BitVector vector3 = new BitVector(5/*width*/, 128/*count, 2^7*/);
      final BitVector vector4 = new BitVector(5/*width*/, 128/*count, 2^7*/);

      for(int i=0; i<128/*2^7*/; i++) {
        vector1.setRegister(i, 0x1F);
        vector2.setRegister(i, (i & 0x1F));
        vector3.setRegister(i, ((127 - i) & 0x1F));
        vector4.setRegister(i, 0x15);
      }

      final LongIterator registerIterator1 = vector1.registerIterator();
      final LongIterator registerIterator2 = vector2.registerIterator();
      final LongIterator registerIterator3 = vector3.registerIterator();
      final LongIterator registerIterator4 = vector4.registerIterator();
      for(int i=0; i<128/*2^7*/; i++) {
        assertTrue(registerIterator1.hasNext());
        assertTrue(registerIterator2.hasNext());
        assertTrue(registerIterator3.hasNext());
        assertTrue(registerIterator4.hasNext());

        assertEquals(0x1F, registerIterator1.next());
        assertEquals((i & 0x1F), registerIterator2.next());
        assertEquals(((127 - i) & 0x1F), registerIterator3.next());
        assertEquals(0x15, registerIterator4.next());
      }
      assertFalse(registerIterator1.hasNext()/*no more*/);
      assertFalse(registerIterator2.hasNext()/*no more*/);
      assertFalse(registerIterator3.hasNext()/*no more*/);
      assertFalse(registerIterator4.hasNext()/*no more*/);
    }
    { // scoped locally for sanity
      // Vectors that are shorter than one word
      assertIterator(1, 12/* 1*12=12 bits, fewer than a single word */);
      assertIterator(2, 12/* 2*12=24 bits, fewer than a single word */);
      assertIterator(3, 12/* 3*12=36 bits, fewer than a single word */);
      assertIterator(4, 12/* 4*12=48 bits, fewer than a single word */);

      // Vectors that don't fit exactly into longs
      assertIterator(5, 16/* 5*16=80 bits */);
      assertIterator(5, 32/* 5*32=160 bits */);
    }

    // Iterate over vectors that are padded
  }

  /**
   * Asserts that a fresh (all-zero) vector's register iterator yields
   * exactly {@code count} zero values.
   *
   * @param width the register width of the vector under test
   * @param count the number of registers in the vector under test
   */
  private static void assertIterator(final int width, final int count) {
    final BitVector vector = new BitVector(width, count);
    final LongIterator iter = vector.registerIterator();

    for(int i=0; i<count; i++) {
      assertTrue(String.format("expected more elements: width=%s, count=%s", width, count), iter.hasNext());
      // TODO: fill with a sentinel value
      assertEquals(0, iter.next());
    }
    assertFalse(String.format("expected no more elements: width=%s, count=%s", width, count), iter.hasNext());
  }

  // ========================================================================
  /**
   * Tests {@link BitVector#setMaxRegister(long, long)}: the register keeps
   * the maximum of its current and the candidate value.
   */
  @Test
  public void setMaxRegisterTest() {
    final BitVector vector = new BitVector(5/*width*/, 128/*count, 2^7*/);
    vector.setRegister(0, 10);

    // should replace with a larger value
    vector.setMaxRegister(0, 11);
    assertEquals(11, vector.getRegister(0));

    // should not replace with a smaller or equal value
    vector.setMaxRegister(0, 9);
    assertEquals(11, vector.getRegister(0));
    vector.setMaxRegister(0, 11);
    assertEquals(11, vector.getRegister(0));
  }

  // ========================================================================
  // fill
  /**
   * Tests {@link BitVector#fill(long)}: filling overwrites every register
   * with the given value.
   */
  @Test
  public void fillTest() {
    final BitVector vector = new BitVector(5/*width*/, 128/*count, 2^7*/);
    for(int i=0; i<128/*2^7*/; i++) {
      vector.setRegister(i, i);
    }

    vector.fill(0L);
    for(int i=0; i<128/*2^7*/; i++) {
      assertEquals(0, vector.getRegister(i));
    }

    vector.fill(17L/*arbitrary*/);
    for(int i=0; i<128/*2^7*/; i++) {
      assertEquals(17/*arbitrary*/, vector.getRegister(i));
    }
  }
}

View File

@@ -0,0 +1,235 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.util.hll;
import java.util.HashSet;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
import com.carrotsearch.hppc.LongOpenHashSet;
import static com.carrotsearch.randomizedtesting.RandomizedTest.*;
/**
* Tests {@link HLL} of type {@link HLLType#EXPLICIT}.
*/
public class ExplicitHLLTest extends LuceneTestCase {
  /**
   * Tests basic set semantics of {@link HLL#addRaw(long)}: positive,
   * negative, and duplicate values.
   */
  @Test
  public void addBasicTest() {
    { // Adding a single positive value to an empty set should work.
      final HLL hll = newHLL(128/*arbitrary*/);
      hll.addRaw(1L/*positive*/);
      assertEquals(1L, hll.cardinality());
    }
    { // Adding a single negative value to an empty set should work.
      final HLL hll = newHLL(128/*arbitrary*/);
      hll.addRaw(-1L/*negative*/);
      assertEquals(1L, hll.cardinality());
    }
    { // Adding a duplicate value to a set should be a no-op.
      final HLL hll = newHLL(128/*arbitrary*/);
      hll.addRaw(1L/*positive*/);
      assertEquals(1L/*arbitrary*/, hll.cardinality());
      // FIX: the duplicate was never actually added before the second
      // assertion -- add it so the no-op behavior is really exercised.
      hll.addRaw(1L/*dupe*/);
      assertEquals(1L/*unchanged*/, hll.cardinality());
    }
  }

  // ------------------------------------------------------------------------
  /**
   * Tests {@link HLL#union(HLL)}.
   */
  @Test
  public void unionTest() {
    {// Unioning two distinct sets should work
      final HLL hllA = newHLL(128/*arbitrary*/);
      final HLL hllB = newHLL(128/*arbitrary*/);
      hllA.addRaw(1L);
      hllA.addRaw(2L);
      hllB.addRaw(3L);

      hllA.union(hllB);
      assertEquals(3, hllA.cardinality());
    }
    {// Unioning two sets whose union doesn't exceed the cardinality cap should not promote
      final HLL hllA = newHLL(128/*arbitrary*/);
      final HLL hllB = newHLL(128/*arbitrary*/);
      hllA.addRaw(1L);
      hllA.addRaw(2L);
      hllB.addRaw(1L);

      hllA.union(hllB);
      assertEquals(2, hllA.cardinality());
    }
    {// unioning two sets whose union exceeds the cardinality cap should promote
      final HLL hllA = newHLL(128/*arbitrary*/);
      final HLL hllB = newHLL(128/*arbitrary*/);

      // fill up sets to explicitThreshold
      for(long i=0; i<128/*explicitThreshold*/; i++) {
        hllA.addRaw(i);
        hllB.addRaw(i + 128);
      }

      hllA.union(hllB);
      assertEquals(HLLType.SPARSE, hllA.getType());
    }
  }

  // ------------------------------------------------------------------------
  /**
   * Tests {@link HLL#clear()}
   */
  @Test
  public void clearTest() {
    final HLL hll = newHLL(128/*arbitrary*/);
    hll.addRaw(1L);
    assertEquals(1L, hll.cardinality());
    hll.clear();
    assertEquals(0L, hll.cardinality());
  }

  // ------------------------------------------------------------------------
  /**
   * Tests the {@link HLL#toBytes(ISchemaVersion)} /
   * {@link HLL#fromBytes(byte[])} round-trip, including the serialized
   * byte length, for empty, partially filled, and full explicit sets.
   */
  @Test
  public void toFromBytesTest() {
    final ISchemaVersion schemaVersion = SerializationUtil.DEFAULT_SCHEMA_VERSION;
    final HLLType type = HLLType.EXPLICIT;
    final int padding = schemaVersion.paddingBytes(type);
    final int bytesPerWord = 8;

    {// Should work on an empty set
      final HLL hll = newHLL(128/*arbitrary*/);

      final byte[] bytes = hll.toBytes(schemaVersion);

      // assert output has correct byte length
      assertEquals(padding/*no elements, just padding*/, bytes.length);

      final HLL inHLL = HLL.fromBytes(bytes);
      assertElementsEqual(hll, inHLL);
    }
    {// Should work on a partially filled set
      final HLL hll = newHLL(128/*arbitrary*/);

      for(int i=0; i<3; i++) {
        hll.addRaw(i);
      }

      final byte[] bytes = hll.toBytes(schemaVersion);

      // assert output has correct byte length
      assertEquals(padding + (bytesPerWord * 3/*elements*/), bytes.length);

      final HLL inHLL = HLL.fromBytes(bytes);
      assertElementsEqual(hll, inHLL);
    }
    {// Should work on a full set
      final int explicitThreshold = 128;
      final HLL hll = newHLL(explicitThreshold);

      for(int i=0; i<explicitThreshold; i++) {
        hll.addRaw(27 + i/*arbitrary*/);
      }

      final byte[] bytes = hll.toBytes(schemaVersion);

      // assert output has correct byte length
      assertEquals(padding + (bytesPerWord * explicitThreshold/*elements*/), bytes.length);

      final HLL inHLL = HLL.fromBytes(bytes);
      assertElementsEqual(hll, inHLL);
    }
  }

  // ------------------------------------------------------------------------
  /**
   * Tests correctness against {@link java.util.HashSet}: an explicit HLL
   * below its threshold must be exact.
   */
  @Test
  public void randomValuesTest() {
    final int explicitThreshold = 4096;
    final HashSet<Long> canonical = new HashSet<Long>();
    final HLL hll = newHLL(explicitThreshold);

    for(int i=0;i<explicitThreshold;i++){
      long randomLong = randomLong();
      canonical.add(randomLong);
      hll.addRaw(randomLong);
    }
    final int canonicalCardinality = canonical.size();
    assertEquals(canonicalCardinality, hll.cardinality());
  }

  // ------------------------------------------------------------------------
  /**
   * Tests promotion to {@link HLLType#SPARSE} and {@link HLLType#FULL}.
   */
  @Test
  public void promotionTest() {
    { // locally scoped for sanity
      final int explicitThreshold = 128;
      final HLL hll = new HLL(11/*log2m, unused*/, 5/*regwidth, unused*/, explicitThreshold, 256/*sparseThreshold*/, HLLType.EXPLICIT);

      for(int i=0;i<explicitThreshold + 1;i++){
        hll.addRaw(i);
      }
      assertEquals(HLLType.SPARSE, hll.getType());
    }
    { // locally scoped for sanity
      final HLL hll = new HLL(11/*log2m, unused*/, 5/*regwidth, unused*/, 4/*expthresh => explicitThreshold = 8*/, false/*sparseon*/, HLLType.EXPLICIT);

      for(int i=0;i<9/* > explicitThreshold */;i++){
        hll.addRaw(i);
      }
      assertEquals(HLLType.FULL, hll.getType());
    }
  }

  // ************************************************************************
  // assertion helpers
  /**
   * Asserts that values in both sets are exactly equal, by comparing the
   * internal explicit storage sets directly.
   */
  private static void assertElementsEqual(final HLL hllA, final HLL hllB) {
    final LongOpenHashSet internalSetA = hllA.explicitStorage;
    final LongOpenHashSet internalSetB = hllB.explicitStorage;

    assertTrue(internalSetA.equals(internalSetB));
  }

  /**
   * Builds a {@link HLLType#EXPLICIT} {@link HLL} instance with the specified
   * explicit threshold.
   *
   * @param explicitThreshold explicit threshold to use for the constructed
   *        {@link HLL}. This must be greater than zero.
   * @return a default-sized {@link HLLType#EXPLICIT} empty {@link HLL} instance.
   *         This will never be <code>null</code>.
   */
  private static HLL newHLL(final int explicitThreshold) {
    return new HLL(11/*log2m, unused*/, 5/*regwidth, unused*/, explicitThreshold, 256/*sparseThreshold, arbitrary, unused*/, HLLType.EXPLICIT);
  }
}

View File

@@ -0,0 +1,341 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.util.hll;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
/**
* Tests {@link HLL} of type {@link HLLType#FULL}.
*/
public class FullHLLTest extends LuceneTestCase {
// TODO union test
/**
* Smoke test for {@link HLL#cardinality()} and the proper use of the
* small range correction.
*/
@Test
public void smallRangeSmokeTest() {
final int log2m = 11;
final int m = (1 << log2m);
final int regwidth = 5;
// only one register set
{
final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
// NOTE(review): presumably constructHLLValue(log2m, ix, val) builds a raw
// hash value that lands in register 'ix' with register value 'val' --
// confirm against ProbabilisticTestUtil.
hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 0/*ix*/, 1/*val*/));
final long cardinality = hll.cardinality();
// Trivially true that small correction conditions hold: one register
// set implies zeroes exist, and estimator trivially smaller than 5m/2.
// Small range correction: m * log(m/V)
// With exactly one register set, V (the zero-register count) is m - 1.
final long expected = (long)Math.ceil(m * Math.log((double)m / (m - 1)/*# of zeroes*/));
assertEquals(cardinality, expected);
}
// all but one register set
{
final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
for(int i=0; i<(m - 1); i++) {
hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i/*ix*/, 1/*val*/));
}
// Trivially true that small correction conditions hold: all but
// one register set implies a zero exists, and estimator trivially
// smaller than 5m/2 since it's alpha / ((m-1)/2)
final long cardinality = hll.cardinality();
// Small range correction: m * log(m/V)
// With all but one register set, V is exactly 1.
final long expected = (long)Math.ceil(m * Math.log((double)m / 1/*# of zeroes*/));
assertEquals(cardinality, expected);
}
}
/**
* Smoke test for {@link HLL#cardinality()} and the proper use of the
* uncorrected estimator
*/
@Test
public void normalRangeSmokeTest() {
final int log2m = 11;
final int regwidth = 5;
// regwidth = 5, so hash space is
// log2m + (2^5 - 1 - 1), so L = log2m + 30
final int l = log2m + 30;
final int m = (1 << log2m);
final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
// all registers at 'medium' value
{
final int registerValue = 7/*chosen to ensure neither correction kicks in*/;
for(int i=0; i<m; i++) {
hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, registerValue));
}
final long cardinality = hll.cardinality();
// Simplified estimator when all registers take same value: alpha / (m/2^val)
final double estimator = HLLUtil.alphaMSquared(m)/((double)m/Math.pow(2, registerValue));
// Assert conditions for uncorrected range: the estimate must fall below
// the large-range threshold (2^L / 30) and above the small-range
// threshold (5m/2), so the raw estimator is used verbatim.
assertTrue(estimator <= Math.pow(2, l)/30);
assertTrue(estimator > (5 * m /(double)2));
final long expected = (long)Math.ceil(estimator);
assertEquals(cardinality, expected);
}
}
/**
 * Smoke test for {@link HLL#cardinality()} and the proper use of the large
 * range correction.
 */
@Test
public void largeRangeSmokeTest() {
  final int log2m = 12;
  final int regwidth = 5;
  // regwidth = 5 gives a hash space of log2m + (2^5 - 1 - 1) bits, so L = log2m + 30
  final int l = log2m + 30;
  final int m = (1 << log2m);
  final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);

  // drive every register to a value large enough that the large-range
  // correction must kick in
  final int registerValue = 31;
  for(int regnum=0; regnum<m; regnum++) {
    hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, regnum, registerValue));
  }
  final long cardinality = hll.cardinality();

  // all registers equal => raw estimator is alpha * m^2 / (m / 2^value)
  final double rawEstimate = HLLUtil.alphaMSquared(m) / ((double)m / Math.pow(2, registerValue));
  // sanity: the large-range condition holds
  assertTrue(rawEstimate > Math.pow(2, l) / 30);

  // large range correction: -2^L * log(1 - E/2^L)
  final double corrected = -1.0 * Math.pow(2, l) * Math.log(1.0 - rawEstimate / Math.pow(2, l));
  assertEquals(cardinality, (long)Math.ceil(corrected));
}
// ========================================================================
/**
 * Tests the bounds on a register's value for a given raw input value.
 *
 * Raw-value encoding, as exercised by the expectations below: the low
 * log2m bits of the raw value select the register index 'j'; the register
 * is set to (p - log2m + 1) where p is the bit position of the
 * least-significant set bit above those index bits (e.g. 0x12 with
 * log2m=4: j=2, next set bit is bit 4, so the register value is 1).
 */
@Test
public void registerValueTest() {
  final int log2m = 4/*small enough to make testing easy (addRaw() shifts by one byte)*/;
  // register width 4 (the minimum size)
  { // scoped locally for sanity
    final int regwidth = 4;
    final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
    // peek directly at the underlying storage to verify register contents
    final BitVector bitVector = hll.probabilisticStorage;
    // lower-bounds of the register: raw values whose first set bit is just
    // above the index bits produce register values 0..4
    hll.addRaw(0x000000000000001L/*'j'=1*/);
    assertEquals(bitVector.getRegister(1/*'j'*/), 0);
    hll.addRaw(0x0000000000000012L/*'j'=2*/);
    assertEquals(bitVector.getRegister(2/*'j'*/), 1);
    hll.addRaw(0x0000000000000023L/*'j'=3*/);
    assertEquals(bitVector.getRegister(3/*'j'*/), 2);
    hll.addRaw(0x0000000000000044L/*'j'=4*/);
    assertEquals(bitVector.getRegister(4/*'j'*/), 3);
    hll.addRaw(0x0000000000000085L/*'j'=5*/);
    assertEquals(bitVector.getRegister(5/*'j'*/), 4);
    // upper-bounds of the register: values 13, 14, 15 then saturation
    // NOTE:  bear in mind that BitVector itself does ensure that
    //        overflow of a register is prevented
    hll.addRaw(0x0000000000010006L/*'j'=6*/);
    assertEquals(bitVector.getRegister(6/*'j'*/), 13);
    hll.addRaw(0x0000000000020007L/*'j'=7*/);
    assertEquals(bitVector.getRegister(7/*'j'*/), 14);
    hll.addRaw(0x0000000000040008L/*'j'=8*/);
    assertEquals(bitVector.getRegister(8/*'j'*/), 15);
    hll.addRaw(0x0000000000080009L/*'j'=9*/);
    assertEquals(bitVector.getRegister(9/*'j'*/), 15/*overflow*/);
    // sanity checks to ensure that no other bits above the lowest-set
    // bit matters
    // NOTE:  same as case 'j = 6' above
    hll.addRaw(0x000000000003000AL/*'j'=10*/);
    assertEquals(bitVector.getRegister(10/*'j'*/), 13);
    hll.addRaw(0x000000000011000BL/*'j'=11*/);
    assertEquals(bitVector.getRegister(11/*'j'*/), 13);
  }
  // register width 5: same lower bounds, but the wider register saturates
  // at 31 instead of 15
  { // scoped locally for sanity
    final int regwidth = 5;
    final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
    final BitVector bitVector = hll.probabilisticStorage;
    // lower-bounds of the register
    hll.addRaw(0x0000000000000001L/*'j'=1*/);
    assertEquals(bitVector.getRegister(1/*'j'*/), 0);
    hll.addRaw(0x0000000000000012L/*'j'=2*/);
    assertEquals(bitVector.getRegister(2/*'j'*/), 1);
    hll.addRaw(0x0000000000000023L/*'j'=3*/);
    assertEquals(bitVector.getRegister(3/*'j'*/), 2);
    hll.addRaw(0x0000000000000044L/*'j'=4*/);
    assertEquals(bitVector.getRegister(4/*'j'*/), 3);
    hll.addRaw(0x0000000000000085L/*'j'=5*/);
    assertEquals(bitVector.getRegister(5/*'j'*/), 4);
    // upper-bounds of the register
    // NOTE:  bear in mind that BitVector itself does ensure that
    //        overflow of a register is prevented
    hll.addRaw(0x0000000100000006L/*'j'=6*/);
    assertEquals(bitVector.getRegister(6/*'j'*/), 29);
    hll.addRaw(0x0000000200000007L/*'j'=7*/);
    assertEquals(bitVector.getRegister(7/*'j'*/), 30);
    hll.addRaw(0x0000000400000008L/*'j'=8*/);
    assertEquals(bitVector.getRegister(8/*'j'*/), 31);
    hll.addRaw(0x0000000800000009L/*'j'=9*/);
    assertEquals(bitVector.getRegister(9/*'j'*/), 31/*overflow*/);
  }
}
// ========================================================================
/**
 * Tests {@link HLL#clear()}.
 */
@Test
public void clearTest() {
  final int regwidth = 5;
  final int log2m = 4/*16 registers per counter*/;
  final int m = 1 << log2m;
  final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
  final BitVector registers = hll.probabilisticStorage;

  // dirty every register ...
  for(int regnum=0; regnum<m; regnum++) {
    registers.setRegister(regnum, regnum);
  }
  // ... then clear, and verify each register is back at its default value
  hll.clear();
  for(int regnum=0; regnum<m; regnum++) {
    assertEquals(registers.getRegister(regnum), 0L/*default value of register*/);
  }
}
// ========================================================================
// Serialization
/**
 * Tests {@link HLL#toBytes(ISchemaVersion)} and {@link HLL#fromBytes(byte[])}.
 */
@Test
public void toFromBytesTest() {
  final int log2m = 11/*arbitrary*/;
  final int regwidth = 5;

  final ISchemaVersion schemaVersion = SerializationUtil.DEFAULT_SCHEMA_VERSION;
  final HLLType type = HLLType.FULL;
  final int padding = schemaVersion.paddingBytes(type);
  final int dataByteCount = ProbabilisticTestUtil.getRequiredBytes(regwidth, (1 << log2m)/*aka 2^log2m = m*/);
  final int expectedByteCount = padding + dataByteCount;

  // an empty element should round-trip
  final HLL emptyHLL = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
  assertRoundTrips(emptyHLL, expectedByteCount, schemaVersion);

  // a partially-filled element should round-trip
  final HLL partialHLL = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
  for(int i=0; i<3; i++) {
    partialHLL.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, (i+9)));
  }
  assertRoundTrips(partialHLL, expectedByteCount, schemaVersion);

  // a fully-populated element should round-trip
  final HLL fullHLL = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
  for(int i=0; i<(1 << log2m)/*aka 2^log2m*/; i++) {
    fullHLL.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, (i % 9) + 1));
  }
  assertRoundTrips(fullHLL, expectedByteCount, schemaVersion);
}

/**
 * Serializes the HLL, asserts the serialized length, deserializes it and
 * asserts register-wise equality with the original.
 */
private static void assertRoundTrips(final HLL hll, final int expectedByteCount, final ISchemaVersion schemaVersion) {
  final byte[] bytes = hll.toBytes(schemaVersion);
  // assert output length is correct
  assertEquals(bytes.length, expectedByteCount);
  final HLL deserialized = HLL.fromBytes(bytes);
  // assert register values correct
  assertElementsEqual(hll, deserialized);
}
// ************************************************************************
// Assertion Helpers
/**
 * Asserts that the two HLLs are register-wise equal.
 *
 * @param hllA the first HLL to compare. This cannot be <code>null</code>.
 * @param hllB the second HLL to compare. This cannot be <code>null</code>.
 */
private static void assertElementsEqual(final HLL hllA, final HLL hllB) {
    final BitVector bitVectorA = hllA.probabilisticStorage;
    // NOTE: the original read hllA.probabilisticStorage for BOTH sides,
    //       comparing an HLL against itself and making the assertion
    //       vacuously true; it must read hllB's storage here.
    final BitVector bitVectorB = hllB.probabilisticStorage;
    final LongIterator iterA = bitVectorA.registerIterator();
    final LongIterator iterB = bitVectorB.registerIterator();
    // registers must match pairwise ...
    for(;iterA.hasNext() && iterB.hasNext();) {
        assertEquals(iterA.next(), iterB.next());
    }
    // ... and both iterators must be exhausted (same register count)
    assertFalse(iterA.hasNext());
    assertFalse(iterB.hasNext());
}
}

View File

@ -0,0 +1,88 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.util.hll;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
import static com.carrotsearch.randomizedtesting.RandomizedTest.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Random;
import static org.apache.solr.util.hll.HLL.*;
/**
 * Serialization smoke-tests.
 */
public class HLLSerializationTest extends LuceneTestCase {
    /**
     * A smoke-test that covers serialization/deserialization of an HLL
     * under all possible parameters.
     */
    @Test
    @Slow
    @Nightly
    public void serializationSmokeTest() throws Exception {
        // a fixed pool of random raw values, shared across every configuration
        final Random random = new Random(randomLong());
        final int randomCount = 250;
        final List<Long> randoms = new ArrayList<Long>(randomCount);
        while (randoms.size() < randomCount) {
            randoms.add(random.nextLong());
        }

        for (final HLLType type : new HLLType[] { HLLType.EMPTY, HLLType.EXPLICIT, HLLType.SPARSE, HLLType.FULL }) {
            assertCardinality(type, randoms);
        }
    }

    // NOTE: log2m<=16 was chosen as the max log2m parameter so that the test
    //       completes in a reasonable amount of time. Not much is gained by
    //       testing larger values - there are no more known serialization
    //       related edge cases that appear as log2m gets even larger.
    // NOTE: This test completed successfully with log2m<=MAXIMUM_LOG2M_PARAM
    //       on 2014-01-30.
    private static void assertCardinality(final HLLType hllType, final Collection<Long> items)
           throws CloneNotSupportedException {
        for(int log2m=MINIMUM_LOG2M_PARAM; log2m<=16; log2m++) {
            for(int regw=MINIMUM_REGWIDTH_PARAM; regw<=MAXIMUM_REGWIDTH_PARAM; regw++) {
                for(int expthr=MINIMUM_EXPTHRESH_PARAM; expthr<=MAXIMUM_EXPTHRESH_PARAM; expthr++ ) {
                    for(final boolean sparse: new boolean[]{true, false}) {
                        // populate a reference HLL with the shared value pool
                        final HLL reference = new HLL(log2m, regw, expthr, sparse, hllType);
                        for(final Long item: items) {
                            reference.addRaw(item);
                        }

                        // round-trip through the serialized form ...
                        final HLL fromBytes = HLL.fromBytes(reference.toBytes());
                        assertEquals(fromBytes.cardinality(), reference.cardinality());
                        assertEquals(fromBytes.getType(), reference.getType());
                        assertTrue(Arrays.equals(fromBytes.toBytes(), reference.toBytes()));

                        // ... and through clone()
                        final HLL cloned = reference.clone();
                        assertEquals(cloned.cardinality(), reference.cardinality());
                        assertEquals(cloned.getType(), reference.getType());
                        assertTrue(Arrays.equals(cloned.toBytes(), reference.toBytes()));
                    }
                }
            }
        }
    }
}

View File

@ -0,0 +1,47 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.util.hll;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
/**
 * Tests {@link HLLUtil} static methods.
 *
 * @author tkarnezo
 */
public class HLLUtilTest extends LuceneTestCase {
    /**
     * Tests that {@link HLLUtil#largeEstimatorCutoff(int, int)} is the same
     * as a trivial implementation, over the full (log2m, registerWidth)
     * parameter space.
     */
    @Test
    public void largeEstimatorCutoffTest() {
        for(int log2m=HLL.MINIMUM_LOG2M_PARAM; log2m<=HLL.MAXIMUM_LOG2M_PARAM; log2m++) {
            // NOTE: the original inner loop bound was MINIMUM_REGWIDTH_PARAM,
            //       which exercised only a single register width; iterate up
            //       to the maximum so every width is covered.
            for(int regWidth=HLL.MINIMUM_REGWIDTH_PARAM; regWidth<=HLL.MAXIMUM_REGWIDTH_PARAM; regWidth++) {
                final double cutoff = HLLUtil.largeEstimatorCutoff(log2m, regWidth);

                // See blog post (http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/)
                // and original paper (Fig. 3) for information on 2^L and
                // "large range correction" cutoff.
                final double expected = Math.pow(2, Math.pow(2, regWidth) - 2 + log2m) / 30.0;
                assertEquals(cutoff, expected, 0.0001);
            }
        }
    }
}

View File

@ -0,0 +1,708 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.util.hll;
import static com.carrotsearch.randomizedtesting.RandomizedTest.*;
import static org.apache.solr.util.hll.ProbabilisticTestUtil.*;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Random;
/**
* Generates test files for testing other implementations of HLL
* serialization/deserialization, namely the PostgreSQL implementation.
*/
public class IntegrationTestGenerator {
// ************************************************************************
// directory to output the generated tests
private static final String OUTPUT_DIRECTORY = "/tmp/hll_test/";
// ------------------------------------------------------------------------
// configurations for HLLs, should mirror settings in PostgreSQL impl. tests
private static final int REGWIDTH = 5;
private static final int LOG2M = 11;
// NOTE: This differs from the PostgreSQL impl. parameter 'expthresh'. This
// is a literal threshold to use in the promotion hierarchy, implying
// that both EXPLICIT representation should be used and it should
// NOT be automatically computed. This is done to ensure that the
// parameters of the test are very explicitly defined.
private static final int EXPLICIT_THRESHOLD = 256;
// NOTE: This is not the PostgreSQL impl. parameter 'sparseon'. 'sparseon'
// is assumed to be true and this is a literal register-count threshold
// to use in the promotion hierarchy. This is done to ensure that the
// parameters of the test are very explicitly defined.
private static final int SPARSE_THRESHOLD = 850;
// ------------------------------------------------------------------------
// computed constants
private static final int REGISTER_COUNT = (1 << LOG2M);
private static final int REGISTER_MAX_VALUE = (1 << REGWIDTH) - 1;
// ========================================================================
// Tests
/**
 * Cumulatively adds random values to a FULL HLL through the small range
 * correction, uncorrected range, and large range correction of the HLL's
 * cardinality estimator.
 *
 * Format: cumulative add
 * Tests:
 * - FULL cardinality computation
 *
 * @param schemaVersion the schema with which to serialize the HLLs. This
 *        cannot be <code>null</code>.
 */
private static void fullCardinalityCorrectionTest(final ISchemaVersion schemaVersion) throws IOException {
    // try-with-resources: the original leaked the FileWriter if a write threw
    try (final FileWriter output = openOutput(schemaVersion, "cardinality_correction", TestType.ADD)) {
        // the accumulator, starts empty
        final HLL hll = newHLL(HLLType.FULL);
        initLineAdd(output, hll, schemaVersion);

        // run through some values in the small range correction
        for(int i=0; i<((1 << LOG2M) - 1); i++) {
            final long rawValue = constructHLLValue(LOG2M, i, 1);
            cumulativeAddLine(output, hll, rawValue, schemaVersion);
        }

        // run up past some values in the uncorrected range
        for(int i=0; i<(1 << LOG2M); i++) {
            final long rawValue = constructHLLValue(LOG2M, i, 7);
            cumulativeAddLine(output, hll, rawValue, schemaVersion);
        }

        // run through some values in the large range correction
        for(int i=0; i<(1 << LOG2M); i++) {
            final long rawValue = constructHLLValue(LOG2M, i, 30);
            cumulativeAddLine(output, hll, rawValue, schemaVersion);
        }

        output.flush();
    }
}
/**
 * Cumulatively adds random values to an EMPTY HLL.
 *
 * Format: cumulative add
 * Tests:
 * - EMPTY, EXPLICIT, SPARSE, PROBABILISTIC addition
 * - EMPTY to EXPLICIT promotion
 * - EXPLICIT to SPARSE promotion
 * - SPARSE to FULL promotion
 *
 * @param schemaVersion the schema with which to serialize the HLLs. This
 *        cannot be <code>null</code>.
 */
private static void globalStepTest(final ISchemaVersion schemaVersion) throws IOException {
    // try-with-resources: the original leaked the FileWriter if a write threw
    try (final FileWriter output = openOutput(schemaVersion, "comprehensive_promotion", TestType.ADD)) {
        // the accumulator, starts empty
        final HLL hll = newHLL(HLLType.EMPTY);
        initLineAdd(output, hll, schemaVersion);

        // enough random adds to walk the accumulator through every promotion
        for(int i=0; i<10000/*arbitrary*/; i++) {
            cumulativeAddLine(output, hll, randomLong(), schemaVersion);
        }

        output.flush();
    }
}
/**
 * Cumulatively unions "underpopulated" FULL HLLs into the
 * accumulator to verify the correct behavior from the PostgreSQL implementation.
 * The PostgreSQL implementation's representations of probabilistic HLLs should
 * depend exclusively on the chosen SPARSE-to-FULL cutoff.
 *
 * Format: cumulative union
 * Tests:
 * - EMPTY U "underpopulated" FULL => SPARSE
 * - SPARSE U "underpopulated" FULL => SPARSE
 * - SPARSE U "barely underpopulated" FULL => FULL
 *
 * @param schemaVersion the schema with which to serialize the HLLs. This
 *        cannot be <code>null</code>.
 */
private static void sparseFullRepresentationTest(final ISchemaVersion schemaVersion) throws IOException {
    // try-with-resources: the original never closed this writer at all
    try (final FileWriter output = openOutput(schemaVersion, "sparse_full_representation", TestType.UNION)) {
        final HLL emptyHLL1 = newHLL(HLLType.EMPTY);
        final HLL emptyHLL2 = newHLL(HLLType.EMPTY);

        cumulativeUnionLine(output, emptyHLL1, emptyHLL2, schemaVersion);

        // NOTE: In this test the sparseReference will be the "expected" value
        //       from the C representation, since it doesn't choose representation
        //       based on original encoding, but rather on the promotion rules
        //       and the declared type of the "receiving" field.
        //       It is the manually-constructed union result.

        // "underpopulated" FULL U EMPTY => SPARSE
        final HLL fullHLL = newHLL(HLLType.FULL);
        fullHLL.addRaw(constructHLLValue(LOG2M, 0/*ix*/, 1/*val*/));

        final HLL sparseHLL = newHLL(HLLType.SPARSE);
        sparseHLL.addRaw(constructHLLValue(LOG2M, 0/*ix*/, 1/*val*/));

        output.write(stringCardinality(fullHLL) + "," + toByteA(fullHLL, schemaVersion) + "," + stringCardinality(sparseHLL) + "," + toByteA(sparseHLL, schemaVersion) + "\n");
        output.flush();

        // "underpopulated" FULL (small) U SPARSE (small) => SPARSE
        final HLL fullHLL2 = newHLL(HLLType.FULL);
        fullHLL2.addRaw(constructHLLValue(LOG2M, 1/*ix*/, 1/*val*/));

        sparseHLL.addRaw(constructHLLValue(LOG2M, 1/*ix*/, 1/*val*/));

        output.write(stringCardinality(fullHLL2) + "," + toByteA(fullHLL2, schemaVersion) + "," + stringCardinality(sparseHLL) + "," + toByteA(sparseHLL, schemaVersion) + "\n");
        output.flush();

        // "underpopulated" FULL (just on edge) U SPARSE (small) => FULL
        final HLL fullHLL3 = newHLL(HLLType.FULL);
        for(int i=2; i<(SPARSE_THRESHOLD + 1); i++) {
            fullHLL3.addRaw(constructHLLValue(LOG2M, i/*ix*/, 1/*val*/));
            sparseHLL.addRaw(constructHLLValue(LOG2M, i/*ix*/, 1/*val*/));
        }

        output.write(stringCardinality(fullHLL3) + "," + toByteA(fullHLL3, schemaVersion) + "," + stringCardinality(sparseHLL) + "," + toByteA(sparseHLL, schemaVersion) + "\n");
        output.flush();
    }
}
/**
 * Cumulatively sets successive registers to:
 *
 *     <code>(registerIndex % REGISTER_MAX_VALUE) + 1</code>
 *
 * by adding specifically constructed values to a SPARSE HLL.
 * Does not induce promotion.
 *
 * Format: cumulative add
 * Tests:
 * - SPARSE addition (predictable)
 *
 * @param schemaVersion the schema with which to serialize the HLLs. This
 *        cannot be <code>null</code>.
 */
private static void sparseStepTest(final ISchemaVersion schemaVersion) throws IOException {
    // try-with-resources: the original leaked the FileWriter if a write threw
    try (final FileWriter output = openOutput(schemaVersion, "sparse_step", TestType.ADD)) {
        // the accumulator, starts empty sparse probabilistic
        final HLL hll = newHLL(HLLType.SPARSE);
        initLineAdd(output, hll, schemaVersion);

        // stay strictly below the promotion cutoff
        for(int i=0; i<SPARSE_THRESHOLD; i++) {
            final long rawValue = constructHLLValue(LOG2M, i, ((i % REGISTER_MAX_VALUE) + 1));
            cumulativeAddLine(output, hll, rawValue, schemaVersion);
        }

        output.flush();
    }
}
/**
 * Cumulatively sets random registers of a SPARSE HLL to
 * random values by adding random values. Does not induce promotion.
 *
 * Format: cumulative add
 * Tests:
 * - SPARSE addition (random)
 *
 * @param schemaVersion the schema with which to serialize the HLLs. This
 *        cannot be <code>null</code>.
 */
private static void sparseRandomTest(final ISchemaVersion schemaVersion) throws IOException {
    // try-with-resources: the original leaked the FileWriter if a write threw
    try (final FileWriter output = openOutput(schemaVersion, "sparse_random", TestType.ADD)) {
        final Random random = new Random(randomLong());

        // the accumulator, starts empty
        final HLL hll = newHLL(HLLType.SPARSE);
        initLineAdd(output, hll, schemaVersion);

        // stay strictly below the promotion cutoff
        for(int i=0; i<SPARSE_THRESHOLD; i++) {
            final int registerIndex = Math.abs(random.nextInt()) % REGISTER_COUNT;
            final int registerValue = ((Math.abs(random.nextInt()) % REGISTER_MAX_VALUE) + 1);
            final long rawValue = constructHLLValue(LOG2M, registerIndex, registerValue);

            cumulativeAddLine(output, hll, rawValue, schemaVersion);
        }

        output.flush();
    }
}
/**
 * Cumulatively sets the first register (index 0) to value 2, the last
 * register (index m-1) to value 2, and then sets registers with indices in
 * the range 2 to (sparseCutoff + 2) to value 1 to trigger promotion.
 *
 * This tests for register alignment in the promotion from SPARSE
 * to FULL.
 *
 * Format: cumulative add
 * Tests:
 * - SPARSE addition
 * - SPARSE to FULL promotion
 *
 * @param schemaVersion the schema with which to serialize the HLLs. This
 *        cannot be <code>null</code>.
 */
private static void sparseEdgeTest(final ISchemaVersion schemaVersion) throws IOException {
    // try-with-resources: the original leaked the FileWriter if a write threw
    try (final FileWriter output = openOutput(schemaVersion, "sparse_edge", TestType.ADD)) {
        // the accumulator, starts empty
        final HLL hll = newHLL(HLLType.SPARSE);
        initLineAdd(output, hll, schemaVersion);

        // pin the first and last registers (alignment sentinels)
        final long firstValue = constructHLLValue(LOG2M, 0, 2);
        cumulativeAddLine(output, hll, firstValue, schemaVersion);

        final long lastValue = constructHLLValue(LOG2M, (1 << LOG2M) - 1, 2);
        cumulativeAddLine(output, hll, lastValue, schemaVersion);

        // fill past the cutoff to force SPARSE -> FULL promotion
        for(int i=2; i<(SPARSE_THRESHOLD + 2); i++) {
            final long middleValue = constructHLLValue(LOG2M, i, 1);
            cumulativeAddLine(output, hll, middleValue, schemaVersion);
        }

        output.flush();
    }
}
/**
 * Unions an EMPTY accumulator with EXPLICIT HLLs, each containing a
 * single random value.
 *
 * Format: cumulative union
 * Tests:
 * - EMPTY U EXPLICIT
 * - EXPLICIT U EXPLICIT
 * - EXPLICIT to SPARSE promotion
 * - SPARSE U EXPLICIT
 *
 * @param schemaVersion the schema with which to serialize the HLLs. This
 *        cannot be <code>null</code>.
 */
private static void explicitPromotionTest(final ISchemaVersion schemaVersion) throws IOException {
    // try-with-resources: the original leaked the FileWriter if a write threw
    try (final FileWriter output = openOutput(schemaVersion, "explicit_promotion", TestType.UNION)) {
        final Random random = new Random(randomLong());

        // the accumulator, starts empty
        final HLL hll = newHLL(HLLType.EMPTY);
        final HLL emptyHLL = newHLL(HLLType.EMPTY);

        cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);

        for(int i=0; i<(EXPLICIT_THRESHOLD+500)/*should be greater than promotion cutoff*/; i++) {
            // make an EXPLICIT set and populate with cardinality 1
            final HLL explicitHLL = newHLL(HLLType.EXPLICIT);
            explicitHLL.addRaw(random.nextLong());

            cumulativeUnionLine(output, hll, explicitHLL, schemaVersion);
        }

        output.flush();
    }
}
/**
 * Unions an EMPTY accumulator with SPARSE HLLs, each
 * having one register set.
 *
 * Format: cumulative union
 * Tests:
 * - EMPTY U SPARSE
 * - SPARSE U SPARSE
 * - SPARSE promotion
 * - SPARSE U FULL
 *
 * @param schemaVersion the schema with which to serialize the HLLs. This
 *        cannot be <code>null</code>.
 */
private static void sparseProbabilisticPromotionTest(final ISchemaVersion schemaVersion) throws IOException {
    // try-with-resources: the original leaked the FileWriter if a write threw
    try (final FileWriter output = openOutput(schemaVersion, "sparse_promotion", TestType.UNION)) {
        final Random random = new Random(randomLong());

        // the accumulator, starts empty
        final HLL hll = newHLL(HLLType.EMPTY);
        final HLL emptyHLL = newHLL(HLLType.EMPTY);

        cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);

        for(int i=0; i<(SPARSE_THRESHOLD + 1000)/*should be greater than promotion cutoff*/; i++) {
            // make a SPARSE set and populate with cardinality 1
            final HLL sparseHLL = newHLL(HLLType.SPARSE);

            final int registerIndex = Math.abs(random.nextInt()) % REGISTER_COUNT;
            final int registerValue = ((Math.abs(random.nextInt()) % REGISTER_MAX_VALUE) + 1);
            final long rawValue = constructHLLValue(LOG2M, registerIndex, registerValue);
            sparseHLL.addRaw(rawValue);

            cumulativeUnionLine(output, hll, sparseHLL, schemaVersion);
        }

        output.flush();
    }
}
/**
 * Unions an EMPTY accumulator with EXPLICIT HLLs, each having a single
 * random value, twice in a row to verify that the set properties are
 * satisfied.
 *
 * Format: cumulative union
 * Tests:
 * - EMPTY U EXPLICIT
 * - EXPLICIT U EXPLICIT
 *
 * @param schemaVersion the schema with which to serialize the HLLs. This
 *        cannot be <code>null</code>.
 */
private static void explicitOverlapTest(final ISchemaVersion schemaVersion) throws IOException {
    // try-with-resources: the original leaked the FileWriter if a write threw
    try (final FileWriter output = openOutput(schemaVersion, "explicit_explicit", TestType.UNION)) {
        final Random random = new Random(randomLong());

        // the accumulator, starts empty
        final HLL hll = newHLL(HLLType.EMPTY);
        final HLL emptyHLL = newHLL(HLLType.EMPTY);

        cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);

        for(int i=0; i<EXPLICIT_THRESHOLD; i++) {
            // make an EXPLICIT set and populate with cardinality 1
            final HLL explicitHLL = newHLL(HLLType.EXPLICIT);
            explicitHLL.addRaw(random.nextLong());

            // union it into the accumulator twice, to test overlap (cardinality should not change)
            cumulativeUnionLine(output, hll, explicitHLL, schemaVersion);
            cumulativeUnionLine(output, hll, explicitHLL, schemaVersion);
        }

        output.flush();
    }
}
/**
 * Unions an EMPTY accumulator with SPARSE HLLs, each
 * having a single register set, twice in a row to verify that the set
 * properties are satisfied.
 *
 * Format: cumulative union
 * Tests:
 * - EMPTY U SPARSE
 * - SPARSE U SPARSE
 *
 * @param schemaVersion the schema with which to serialize the HLLs. This
 *        cannot be <code>null</code>.
 */
private static void sparseProbabilisticOverlapTest(final ISchemaVersion schemaVersion) throws IOException {
    // try-with-resources: the original leaked the FileWriter if a write threw
    try (final FileWriter output = openOutput(schemaVersion, "sparse_sparse", TestType.UNION)) {
        final Random random = new Random(randomLong());

        // the accumulator, starts empty
        final HLL hll = newHLL(HLLType.EMPTY);
        final HLL emptyHLL = newHLL(HLLType.EMPTY);

        cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);

        for(int i=0; i<SPARSE_THRESHOLD; i++) {
            // make a SPARSE set and populate with cardinality 1
            final HLL sparseHLL = newHLL(HLLType.SPARSE);
            final int registerIndex = Math.abs(random.nextInt()) % REGISTER_COUNT;
            final int registerValue = ((Math.abs(random.nextInt()) % REGISTER_MAX_VALUE) + 1);
            final long rawValue = constructHLLValue(LOG2M, registerIndex, registerValue);
            sparseHLL.addRaw(rawValue);

            cumulativeUnionLine(output, hll, sparseHLL, schemaVersion);
        }

        output.flush();
    }
}
/**
 * Unions an EMPTY accumulator with FULL HLLs, each having
 * many registers set, twice in a row to verify that the set properties are
 * satisfied.
 *
 * Format: cumulative union
 * Tests:
 * - EMPTY U FULL
 * - FULL U FULL
 *
 * @param schemaVersion the schema with which to serialize the HLLs. This
 *        cannot be <code>null</code>.
 */
private static void probabilisticUnionTest(final ISchemaVersion schemaVersion) throws IOException {
    // try-with-resources: the original leaked the FileWriter if a write threw
    try (final FileWriter output = openOutput(schemaVersion, "probabilistic_probabilistic", TestType.UNION)) {
        final Random random = new Random(randomLong());

        // the accumulator, starts empty
        final HLL hll = newHLL(HLLType.EMPTY);
        final HLL emptyHLL = newHLL(HLLType.EMPTY);

        cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);

        for(int i=0; i<1000/*number of rows to generate*/; i++) {
            // make a FULL set and populate with a random number of random values
            final HLL fullHLL = newHLL(HLLType.FULL);
            final int elementCount = random.nextInt(10000/*arbitrary maximum cardinality*/);
            for(int j=0;j<elementCount;j++) {
                fullHLL.addRaw(random.nextLong());
            }

            cumulativeUnionLine(output, hll, fullHLL, schemaVersion);
        }

        output.flush();
    }
}
/**
 * Unions an EMPTY accumulator with random HLLs.
 *
 * Format: cumulative union
 * Tests:
 * - hopefully all union possibilities
 *
 * @param schemaVersion the schema with which to serialize the HLLs. This
 *        cannot be <code>null</code>.
 */
private static void globalUnionTest(final ISchemaVersion schemaVersion) throws IOException {
    // try-with-resources: the original leaked the FileWriter if a write threw
    try (final FileWriter output = openOutput(schemaVersion, "comprehensive", TestType.UNION)) {
        // the accumulator, starts empty
        final HLL hll = newHLL(HLLType.EMPTY);
        final HLL emptyHLL = newHLL(HLLType.EMPTY);

        cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);

        for(int i=0; i<1000/*number of rows to generate*/; i++) {
            final HLL randomHLL = generateRandomHLL();
            cumulativeUnionLine(output, hll, randomHLL, schemaVersion);
        }

        output.flush();
    }
}
// ========================================================================
// Main
/**
 * Runs every generator in this class for the specified schema version,
 * writing one CSV file per test into {@link #OUTPUT_DIRECTORY}.
 *
 * @param schemaVersion the schema with which to serialize the HLLs. This
 *        cannot be <code>null</code>.
 * @throws IOException if any output file cannot be written.
 */
public static void fullSuite(final ISchemaVersion schemaVersion) throws IOException {
    fullCardinalityCorrectionTest(schemaVersion);
    globalUnionTest(schemaVersion);
    globalStepTest(schemaVersion);
    probabilisticUnionTest(schemaVersion);
    explicitPromotionTest(schemaVersion);
    explicitOverlapTest(schemaVersion);
    sparseFullRepresentationTest(schemaVersion);
    sparseStepTest(schemaVersion);
    sparseRandomTest(schemaVersion);
    sparseEdgeTest(schemaVersion);
    sparseProbabilisticPromotionTest(schemaVersion);
    sparseProbabilisticOverlapTest(schemaVersion);
}
/**
 * Entry point: generates the full suite of test files for schema version 1
 * under {@link #OUTPUT_DIRECTORY}.
 *
 * @throws IOException if any output file cannot be written.
 */
public static void main(String[] args) throws IOException {
    fullSuite(SerializationUtil.VERSION_ONE);
}
// ************************************************************************
// Helpers
/**
 * Shortcut for testing constructor, which uses the constants defined at
 * the top of the file as default parameters.
 *
 * @return a new {@link HLL} of specified type, which uses the parameters
 *         ({@link #LOG2M}, {@link #REGWIDTH}, {@link #EXPLICIT_THRESHOLD},
 *         and {@link #SPARSE_THRESHOLD}) specified above.
 */
private static HLL newHLL(final HLLType type) {
    // NOTE: the original body was 'return newHLL(type);' -- unconditional
    //       self-recursion that would throw StackOverflowError on first use.
    //       Construct the HLL directly with the file-level defaults instead.
    return new HLL(LOG2M, REGWIDTH, EXPLICIT_THRESHOLD, SPARSE_THRESHOLD, type);
}
/**
 * Returns the algorithm-specific cardinality of the specified {@link HLL}
 * as a {@link String} appropriate for comparison with the algorithm-specific
 * cardinality provided by the PostgreSQL implementation.
 *
 * @param hll the HLL whose algorithm-specific cardinality is to be printed.
 *        This cannot be <code>null</code>.
 * @return the algorithm-specific cardinality of the instance as a PostgreSQL-
 *         compatible String. This will never be <code>null</code>
 */
private static String stringCardinality(final HLL hll) {
    final HLLType type = hll.getType();
    if(type == HLLType.EMPTY) {
        return "0";
    } else if(type == HLLType.EXPLICIT) {
        /*promotion has not yet occurred*/
        return Long.toString(hll.cardinality());
    } else if(type == HLLType.SPARSE) {
        return Double.toString(hll.sparseProbabilisticAlgorithmCardinality());
    } else if(type == HLLType.FULL) {
        return Double.toString(hll.fullProbabilisticAlgorithmCardinality());
    } else {
        throw new RuntimeException("Unknown HLL type " + type);
    }
}
/**
 * Generates a random HLL and populates it with random values.
 *
 * @return the populated HLL. This will never be <code>null</code>.
 */
public static HLL generateRandomHLL() {
    // pick a type uniformly from the enum's declared values
    final int randomTypeInt = randomIntBetween(0, HLLType.values().length - 1);
    final HLLType type;
    switch(randomTypeInt) {
        case 0:
            type = HLLType.EMPTY;
            break;
        case 1:
            type = HLLType.EXPLICIT;
            break;
        case 2:
            type = HLLType.FULL;
            break;
        case 3:
            // NOTE(review): this duplicates case 0 (EMPTY). If HLLType declares
            //       only the four types used in this file, randomTypeInt never
            //       reaches 4, so SPARSE below would be unreachable and never
            //       generated -- confirm against HLLType's declared constants
            //       and consider mapping this case to SPARSE instead.
            type = HLLType.EMPTY;
            break;
        case 4:
            type = HLLType.SPARSE;
            break;
        default:
            throw new RuntimeException("Unassigned type int " + randomTypeInt);
    }

    // choose a cardinality range appropriate for the chosen representation
    // so the returned HLL actually resides in that representation
    final int cardinalityCap;
    final int cardinalityBaseline;
    switch(type) {
        case EMPTY:
            return newHLL(HLLType.EMPTY);
        case EXPLICIT:
            cardinalityCap = EXPLICIT_THRESHOLD;
            cardinalityBaseline = 1;
            break;
        case SPARSE:
            cardinalityCap = SPARSE_THRESHOLD;
            cardinalityBaseline = (EXPLICIT_THRESHOLD + 1);
            break;
        case FULL:
            cardinalityCap = 100000;
            cardinalityBaseline = (SPARSE_THRESHOLD*10);
            break;
        default:
            throw new RuntimeException("We should never be here.");
    }

    // add the baseline number of values, then a random surplus up to the cap
    final HLL hll = newHLL(HLLType.EMPTY);
    for(int i=0; i<cardinalityBaseline; i++) {
        hll.addRaw(randomLong());
    }
    for(int i=0; i<randomInt(cardinalityCap - cardinalityBaseline); i++) {
        hll.addRaw(randomLong());
    }
    return hll;
}
/**
 * Opens a {@link FileWriter} and writes out an appropriate CSV header.
 *
 * @param  schemaVersion Schema version of the output. This cannot be
 *         <code>null</code>.
 * @param  description Description string used to build the filename.
 *         This cannot be <code>null</code>.
 * @param  type {@link TestType type} of the test file to be written.
 *         This cannot be <code>null</code>.
 * @return The opened {@link FileWriter writer}. This will never be <code>null</code>.
 */
private static FileWriter openOutput(final ISchemaVersion schemaVersion, final String description, final TestType type) throws IOException {
    final String schemaVersionPrefix = "v"+ schemaVersion.schemaVersionNumber() + "_";
    final String header;
    final String filename;
    switch(type) {
        case ADD:
            header = "cardinality,raw_value,HLL\n";
            filename = schemaVersionPrefix + "cumulative_add_" + description + ".csv";
            break;
        case UNION:
            header = "cardinality,HLL,union_cardinality,union_HLL\n";
            filename = schemaVersionPrefix + "cumulative_union_" + description + ".csv";
            break;
        default:
            throw new RuntimeException("Unknown test type " + type);
    }

    final FileWriter output = new FileWriter(OUTPUT_DIRECTORY + filename);
    try {
        output.write(header);
        output.flush();
    } catch(final IOException e) {
        // don't leak the writer if the header can't be written (the original
        // left the file handle open on this failure path)
        output.close();
        throw e;
    }
    return output;
}
/**
 * Writes out a {@link TestType#ADD}-formatted test line.
 *
 * @param output The output {@link FileWriter writer}. This cannot be <code>null</code>.
 * @param hll The "accumulator" HLL instance. This cannot be <code>null</code>.
 * @param rawValue The raw value added to the HLL.
 * @param schemaVersion the schema with which to serialize the HLLs. This cannot
 *        be <code>null</code>.
 */
private static void cumulativeAddLine(final FileWriter output, final HLL hll, final long rawValue, final ISchemaVersion schemaVersion) throws IOException {
    hll.addRaw(rawValue);

    // line format: <cardinality>,<raw value>,<serialized accumulator>
    final StringBuilder line = new StringBuilder();
    line.append(stringCardinality(hll))
        .append(',')
        .append(rawValue)
        .append(',')
        .append(toByteA(hll, schemaVersion))
        .append('\n');
    output.write(line.toString());
    output.flush();
}
/**
 * Writes an initial line for a {@link TestType#ADD}-formatted test.
 *
 * @param output The output {@link FileWriter writer}. This cannot be <code>null</code>.
 * @param hll The "accumulator" HLL instance. This cannot be <code>null</code>.
 * @param schemaVersion the schema with which to serialize the HLLs. This cannot
 *        be <code>null</code>.
 */
private static void initLineAdd(final FileWriter output, final HLL hll, final ISchemaVersion schemaVersion) throws IOException {
    // cardinality and raw value are both zero on the initial line
    output.write("0,0," + toByteA(hll, schemaVersion) + "\n");
    output.flush();
}
/**
 * Writes out a {@link TestType#UNION}-formatted test line.
 *
 * @param output The output {@link FileWriter writer}. This cannot be <code>null</code>.
 * @param hll The "accumulator" HLL instance. This cannot be <code>null</code>.
 * @param increment The "increment" HLL instance which will be unioned into
 *        the accumulator. This cannot be <code>null</code>.
 * @param schemaVersion the schema with which to serialize the HLLs. This cannot
 *        be <code>null</code>.
 */
private static void cumulativeUnionLine(final FileWriter output, final HLL hll, final HLL increment, final ISchemaVersion schemaVersion) throws IOException {
    hll.union(increment);

    // cardinalities are computed after the union so the accumulator column
    // reflects the merged state
    final String incrementCardinality = stringCardinality(increment);
    final String incrementSerialized = toByteA(increment, schemaVersion);
    final String accumulatorCardinality = stringCardinality(hll);
    final String accumulatorSerialized = toByteA(hll, schemaVersion);

    output.write(incrementCardinality + "," + incrementSerialized + "," + accumulatorCardinality + "," + accumulatorSerialized + "\n");
    output.flush();
}
/**
 * Serializes a HLL to Postgres 9 'bytea' hex-format, for CSV ingest.
 *
 * @param hll the HLL to serialize. This cannot be <code>null</code>.
 * @param schemaVersion the schema with which to serialize the HLLs. This cannot
 *        be <code>null</code>.
 * @return a PostgreSQL 'bytea' string representing the HLL.
 */
private static String toByteA(final HLL hll, final ISchemaVersion schemaVersion) {
    final byte[] serialized = hll.toBytes(schemaVersion);
    // bytea hex format is "\x" followed by two hex digits per byte
    final String hex = NumberUtil.toHex(serialized, 0, serialized.length);
    return "\\x" + hex;
}
/**
* Indicates what kind of test output a test will generate.
*/
private static enum TestType {
/**
* This type of test is characterized by values being added to an
* accumulator HLL whose serialized representation (after the value is added)
* is printed to each line along with the cardinality and added value.
*/
ADD,
/**
* This type of test is characterized by HLLs being unioned into an
* accumulator HLL whose serialized representation (after the HLL is
* union'd) is printed to each line along with the cardinalities and the
* serialized representation of the HLL union'd in.
*/
UNION;
}
}

View File

@ -0,0 +1,76 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.util.hll;
/**
* A collection of test utilities for constructing input values to HLLs and for
* computing their serialized size.
*/
/**
 * A collection of test utilities for constructing input values to HLLs and for
 * computing their serialized size.
 */
public class ProbabilisticTestUtil {
    // Utility class: static methods only. Private constructor prevents
    // accidental instantiation (Effective Java, Item 4).
    private ProbabilisticTestUtil() { /* no instances */ }

    /**
     * Constructs a value that when added raw to a HLL will set the register at
     * <code>registerIndex</code> to <code>registerValue</code>.
     *
     * @param log2m the log-base-2 of the number of registers in the HLL
     * @param registerIndex the index of the register to set
     * @param registerValue the value to set the register to
     * @return the value
     */
    public static long constructHLLValue(final int log2m, final int registerIndex, final int registerValue) {
        // the low log2m bits select the register; the position of the lowest
        // set bit in the remaining (substream) bits determines the register value
        final long partition = registerIndex;
        final long substreamValue = (1L << (registerValue - 1));
        return (substreamValue << log2m) | partition;
    }

    /**
     * Extracts the HLL register index from a raw value.
     *
     * @param rawValue the raw value as fed to {@code HLL#addRaw(long)}
     * @param log2m the log-base-2 of the number of registers in the HLL
     * @return the register index encoded in the low <code>log2m</code> bits
     */
    public static short getRegisterIndex(final long rawValue, final int log2m) {
        final long mBitsMask = (1 << log2m) - 1;
        final short j = (short)(rawValue & mBitsMask);
        return j;
    }

    /**
     * Extracts the HLL register value from a raw value.
     *
     * @param rawValue the raw value as fed to {@code HLL#addRaw(long)}
     * @param log2m the log-base-2 of the number of registers in the HLL
     * @return the register value: one plus the position of the least
     *         significant set bit of the substream, capped at 31, or zero if
     *         the substream is all zeroes
     */
    public static byte getRegisterValue(final long rawValue, final int log2m) {
        final long substreamValue = (rawValue >>> log2m);
        final byte p_w;
        if (substreamValue == 0L) {
            // The paper does not cover p(0x0), so the special value 0 is used.
            // 0 is the original initialization value of the registers, so by
            // doing this the HLL simply ignores it. This is acceptable
            // because the probability is 1/(2^(2^registerSizeInBits)).
            p_w = 0;
        } else {
            p_w = (byte)Math.min(1 + BitUtil.leastSignificantBit(substreamValue), 31);
        }
        return p_w;
    }

    /**
     * @param shortWordLength the width of a register word, in bits
     * @param registerCount the number of registers to be packed
     * @return the number of bytes required to pack <code>registerCount</code>
     *         registers of width <code>shortWordLength</code>.
     */
    public static int getRequiredBytes(final int shortWordLength, final int registerCount) {
        // ceil(total bits / 8); float math is exact for the small sizes used in tests
        return (int)Math.ceil((registerCount * shortWordLength)/(float)8);
    }
}

View File

@ -0,0 +1,453 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.util.hll;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
import com.carrotsearch.hppc.IntByteOpenHashMap;
import com.carrotsearch.hppc.cursors.IntByteCursor;
import com.carrotsearch.randomizedtesting.RandomizedTest;
/**
* Tests {@link HLL} of type {@link HLLType#SPARSE}.
*/
public class SparseHLLTest extends LuceneTestCase {
// log-base-2 of the register count (2^11 registers), shared by most tests below
private static final int log2m = 11;
/**
 * Tests {@link HLL#addRaw(long)}.
 */
@Test
public void addTest() {
{ // insert an element with register value 1 (minimum set value)
final int registerIndex = 0;
final int registerValue = 1;
final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);
final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
hll.addRaw(rawValue);
assertOneRegisterSet(hll, registerIndex, (byte)registerValue);
}
{ // insert an element with register value 31 (maximum set value)
final int registerIndex = 0;
final int registerValue = 31;
final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);
final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
hll.addRaw(rawValue);
assertOneRegisterSet(hll, registerIndex, (byte)registerValue);
}
{ // insert an element that could overflow the register (past 31)
final int registerIndex = 0;
final int registerValue = 36;
final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);
final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
hll.addRaw(rawValue);
// value is clamped to the register maximum (2^5 - 1 = 31)
assertOneRegisterSet(hll, (short)registerIndex, (byte)31/*register max*/);
}
{ // insert duplicate elements, observe no change
final int registerIndex = 0;
final int registerValue = 1;
final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);
final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
hll.addRaw(rawValue);
hll.addRaw(rawValue);
assertOneRegisterSet(hll, registerIndex, (byte)registerValue);
}
{ // insert elements that increase a register's value
final int registerIndex = 0;
final int registerValue = 1;
final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);
final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
hll.addRaw(rawValue);
final int registerValue2 = 2;
final long rawValue2 = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue2);
hll.addRaw(rawValue2);
assertOneRegisterSet(hll, registerIndex, (byte)registerValue2);
}
{ // insert elements that have lower register values, observe no change
final int registerIndex = 0;
final int registerValue = 2;
final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);
final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
hll.addRaw(rawValue);
final int registerValue2 = 1;
final long rawValue2 = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue2);
hll.addRaw(rawValue2);
assertOneRegisterSet(hll, registerIndex, (byte)registerValue);
}
}
/**
 * Smoke test for {@link HLL#cardinality()} and the proper use of the small
 * range correction.
 */
@Test
public void smallRangeSmokeTest() {
final int log2m = 11;
final int m = (1 << log2m);
final int regwidth = 5;
// only one register set
{
final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 0, 1));
final long cardinality = hll.cardinality();
// Trivially true that small correction conditions hold: one register
// set implies zeroes exist, and estimator trivially smaller than 5m/2.
// Small range correction: m * log(m/V)
final long expected = (long)Math.ceil(m * Math.log((double)m / (m - 1)/*# of zeroes*/));
assertEquals(cardinality, expected);
}
// all but one register set
{
final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
for(int i=0; i<(m - 1); i++) {
hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, 1));
}
// Trivially true that small correction conditions hold: all but
// one register set implies a zero exists, and estimator trivially
// smaller than 5m/2 since it's alpha / ((m-1)/2)
final long cardinality = hll.cardinality();
// Small range correction: m * log(m/V)
final long expected = (long)Math.ceil(m * Math.log((double)m / 1/*# of zeroes*/));
assertEquals(cardinality, expected);
}
}
/**
 * Smoke test for {@link HLL#cardinality()} and the proper use of the
 * uncorrected estimator.
 */
@Test
public void normalRangeSmokeTest() {
final int log2m = 11;
final int m = (1 << log2m);
final int regwidth = 5;
// regwidth = 5, so hash space is
// log2m + (2^5 - 1 - 1), so L = log2m + 30
final int l = log2m + 30;
// all registers at 'medium' value
{
final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, m/*sparseThreshold*/, HLLType.SPARSE);
final int registerValue = 7/*chosen to ensure neither correction kicks in*/;
for(int i=0; i<m; i++) {
hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, registerValue));
}
final long cardinality = hll.cardinality();
// Simplified estimator when all registers take same value: alpha / (m/2^val)
final double estimator = HLLUtil.alphaMSquared(m)/((double)m/Math.pow(2, registerValue));
// Assert conditions for uncorrected range: estimator must lie between
// the small-range bound (5m/2) and the large-range bound (2^L/30)
assertTrue(estimator <= Math.pow(2,l)/30);
assertTrue(estimator > (5 * m /(double)2));
final long expected = (long)Math.ceil(estimator);
assertEquals(cardinality, expected);
}
}
/**
 * Smoke test for {@link HLL#cardinality()} and the proper use of the large
 * range correction.
 */
@Test
public void largeRangeSmokeTest() {
final int log2m = 11;
final int m = (1 << log2m);
final int regwidth = 5;
// regwidth = 5, so hash space is
// log2m + (2^5 - 1 - 1), so L = log2m + 30
final int l = log2m + 30;
// all registers at large value
{
final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, m/*sparseThreshold*/, HLLType.SPARSE);
final int registerValue = 31/*chosen to ensure large correction kicks in*/;
for(int i=0; i<m; i++) {
hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, registerValue));
}
final long cardinality = hll.cardinality();
// Simplified estimator when all registers take same value: alpha / (m/2^val)
final double estimator = HLLUtil.alphaMSquared(m)/((double)m/Math.pow(2, registerValue));
// Assert conditions for large range
assertTrue(estimator > Math.pow(2, l)/30);
// Large range correction: -2^L * log(1 - E/2^L)
final long expected = (long)Math.ceil(-1.0 * Math.pow(2, l) * Math.log(1.0 - estimator/Math.pow(2, l)));
assertEquals(cardinality, expected);
}
}
/**
 * Tests {@link HLL#union(HLL)}.
 */
@Test
public void unionTest() {
final int log2m = 11/*arbitrary*/;
final int sparseThreshold = 256/*arbitrary*/;
{ // two empty multisets should union to an empty set
final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
hllA.union(hllB);
assertEquals(hllA.getType(), HLLType.SPARSE/*unchanged*/);
assertEquals(hllA.cardinality(), 0L);
}
{ // two disjoint multisets should union properly
final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
hllA.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 1, 1));
final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
hllB.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 2, 1));
hllA.union(hllB);
assertEquals(hllA.getType(), HLLType.SPARSE/*unchanged*/);
assertEquals(hllA.cardinality(), 3L/*precomputed*/);
assertRegisterPresent(hllA, 1, (byte)1);
assertRegisterPresent(hllA, 2, (byte)1);
}
{ // two exactly overlapping multisets should union properly
final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
hllA.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 1, 10));
final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
hllB.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 1, 13));
hllA.union(hllB);
assertEquals(hllA.getType(), HLLType.SPARSE/*unchanged*/);
assertEquals(hllA.cardinality(), 2L/*precomputed*/);
assertOneRegisterSet(hllA, 1, (byte)13/*max(10,13)*/);
}
{ // overlapping multisets should union properly
final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
// register index = 3
final long rawValueA = ProbabilisticTestUtil.constructHLLValue(log2m, 3, 11);
// register index = 4
final long rawValueB = ProbabilisticTestUtil.constructHLLValue(log2m, 4, 13);
final long rawValueBPrime = ProbabilisticTestUtil.constructHLLValue(log2m, 4, 21);
// register index = 5
final long rawValueC = ProbabilisticTestUtil.constructHLLValue(log2m, 5, 14);
hllA.addRaw(rawValueA);
hllA.addRaw(rawValueB);
hllB.addRaw(rawValueBPrime);
hllB.addRaw(rawValueC);
hllA.union(hllB);
// union should have three registers set, with partition B set to the
// max of the two registers
assertRegisterPresent(hllA, 3, (byte)11);
assertRegisterPresent(hllA, 4, (byte)21/*max(21,13)*/);
assertRegisterPresent(hllA, 5, (byte)14);
}
{ // too-large unions should promote
final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
// fill up sets to maxCapacity
for(int i=0; i<sparseThreshold; i++) {
hllA.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, 1));
hllB.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, (i + sparseThreshold)/*non-overlapping*/, 1));
}
hllA.union(hllB);
assertEquals(hllA.getType(), HLLType.FULL);
}
}
/**
 * Tests {@link HLL#clear()}.
 */
@Test
public void clearTest() {
final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.SPARSE);
hll.addRaw(1L);
hll.clear();
assertEquals(hll.cardinality(), 0L);
}
/**
 * Tests {@link HLL#toBytes(ISchemaVersion)} and
 * {@link HLL#fromBytes(byte[])}.
 */
@Test
public void toFromBytesTest() {
final int log2m = 11/*arbitrary*/;
final int regwidth = 5/*arbitrary*/;
final int sparseThreshold = 256/*arbitrary*/;
final int shortWordLength = 16/*log2m + regwidth = 11 + 5*/;
final ISchemaVersion schemaVersion = SerializationUtil.DEFAULT_SCHEMA_VERSION;
final HLLType type = HLLType.SPARSE;
final int padding = schemaVersion.paddingBytes(type);
{// Should work on an empty element
final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
final byte[] bytes = hll.toBytes(schemaVersion);
// output should just be padding since no registers are used
assertEquals(bytes.length, padding);
final HLL inHLL = HLL.fromBytes(bytes);
// assert register values correct
assertElementsEqual(hll, inHLL);
}
{// Should work on a partially filled element
final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
for(int i=0; i<3; i++) {
final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, i, (i+9));
hll.addRaw(rawValue);
}
final byte[] bytes = hll.toBytes(schemaVersion);
assertEquals(bytes.length, padding + ProbabilisticTestUtil.getRequiredBytes(shortWordLength, 3/*registerCount*/));
final HLL inHLL = HLL.fromBytes(bytes);
// assert register values correct
assertElementsEqual(hll, inHLL);
}
{// Should work on a full set
final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
for(int i=0; i<sparseThreshold; i++) {
final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, i, (i % 9) + 1);
hll.addRaw(rawValue);
}
final byte[] bytes = hll.toBytes(schemaVersion);
// 'short words' should be 11 bits + 5 bits = 16 bits long
assertEquals(bytes.length, padding + ProbabilisticTestUtil.getRequiredBytes(shortWordLength, sparseThreshold));
final HLL inHLL = HLL.fromBytes(bytes);
// assert register values correct
assertElementsEqual(hll, inHLL);
}
}
/**
 * Smoke tests the multisets by adding random values.
 */
@Test
public void randomValuesTest() {
final int log2m = 11/*arbitrary*/;
final int regwidth = 5/*arbitrary*/;
final int sparseThreshold = 256/*arbitrary*/;
for(int run=0; run<100; run++) {
final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
// shadow map of expected (index -> max register value) pairs
// NOTE(review): relies on IntByteOpenHashMap.get returning 0 for an
// absent key so unset registers compare against 0 — confirm with HPPC docs
final IntByteOpenHashMap map = new IntByteOpenHashMap();
for(int i=0; i<sparseThreshold; i++) {
final long rawValue = RandomizedTest.randomLong();
final short registerIndex = ProbabilisticTestUtil.getRegisterIndex(rawValue, log2m);
final byte registerValue = ProbabilisticTestUtil.getRegisterValue(rawValue, log2m);
if(map.get(registerIndex) < registerValue) {
map.put(registerIndex, registerValue);
}
hll.addRaw(rawValue);
}
for (IntByteCursor c : map) {
final byte expectedRegisterValue = map.get(c.key);
assertRegisterPresent(hll, c.key, expectedRegisterValue);
}
}
}
//*************************************************************************
// assertion helpers
/**
 * Asserts that the register at the specified index is set to the specified
 * value.
 */
private static void assertRegisterPresent(final HLL hll,
final int registerIndex,
final int registerValue) {
final IntByteOpenHashMap sparseProbabilisticStorage = hll.sparseProbabilisticStorage;
assertEquals(sparseProbabilisticStorage.get(registerIndex), registerValue);
}
/**
 * Asserts that only the specified register is set and has the specified value.
 */
private static void assertOneRegisterSet(final HLL hll,
final int registerIndex,
final byte registerValue) {
final IntByteOpenHashMap sparseProbabilisticStorage = hll.sparseProbabilisticStorage;
assertEquals(sparseProbabilisticStorage.size(), 1);
assertEquals(sparseProbabilisticStorage.get(registerIndex), registerValue);
}
/**
 * Asserts that all registers in the two {@link HLL} instances are identical.
 */
private static void assertElementsEqual(final HLL hllA, final HLL hllB) {
final IntByteOpenHashMap sparseProbabilisticStorageA = hllA.sparseProbabilisticStorage;
final IntByteOpenHashMap sparseProbabilisticStorageB = hllB.sparseProbabilisticStorage;
assertEquals(sparseProbabilisticStorageA.size(), sparseProbabilisticStorageB.size());
for (IntByteCursor c : sparseProbabilisticStorageA) {
assertEquals(sparseProbabilisticStorageA.get(c.key),
sparseProbabilisticStorageB.get(c.key));
}
}
}