mirror of https://github.com/apache/lucene.git
Ported tests as well.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/solr7787@1691351 13f79535-47bb-0310-9956-ffa450edef68
parent 1842589815
commit 139460e8c5

@@ -0,0 +1,191 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.util.hll;

import java.util.Random;

import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;

import static com.carrotsearch.randomizedtesting.RandomizedTest.*;

/**
 * Unit and smoke tests for {@link BigEndianAscendingWordDeserializer}.
 *
 * @author timon
 */
public class BigEndianAscendingWordDeserializerTest extends LuceneTestCase {
    /**
     * Error checking tests for the constructor.
     */
    @Test
    public void constructorErrorTest() {
        // word length too small
        try {
            new BigEndianAscendingWordDeserializer(0/*wordLength, below minimum of 1*/, 0/*bytePadding, arbitrary*/, new byte[1]/*bytes, arbitrary, not used here*/);
            fail("Should complain about too-short words.");
        } catch(final IllegalArgumentException e) {
            assertTrue(e.getMessage().contains("Word length must be"));
        }

        // word length too large
        try {
            new BigEndianAscendingWordDeserializer(65/*wordLength, above maximum of 64*/, 0/*bytePadding, arbitrary*/, new byte[1]/*bytes, arbitrary, not used here*/);
            fail("Should complain about too-long words.");
        } catch(final IllegalArgumentException e) {
            assertTrue(e.getMessage().contains("Word length must be"));
        }

        // byte padding negative
        try {
            new BigEndianAscendingWordDeserializer(5/*wordLength, arbitrary*/, -1/*bytePadding, too small*/, new byte[1]/*bytes, arbitrary, not used here*/);
            fail("Should complain about negative byte padding.");
        } catch(final IllegalArgumentException e) {
            assertTrue(e.getMessage().contains("Byte padding must be"));
        }
    }

    /**
     * Smoke test using 64-bit words and special word values.
     */
    @Test
    public void smokeTest64BitWord() {
        final BigEndianAscendingWordSerializer serializer =
                new BigEndianAscendingWordSerializer(64/*wordLength*/,
                                                     5/*wordCount*/,
                                                     0/*bytePadding, arbitrary*/);

        // Check that the sign bit is being preserved.
        serializer.writeWord(-1L);
        serializer.writeWord(-112894714L);

        // Check "special" values.
        serializer.writeWord(0L);
        serializer.writeWord(Long.MAX_VALUE);
        serializer.writeWord(Long.MIN_VALUE);

        final byte[] bytes = serializer.getBytes();

        final BigEndianAscendingWordDeserializer deserializer =
                new BigEndianAscendingWordDeserializer(64/*wordLength*/, 0/*bytePadding*/, bytes);

        assertEquals(deserializer.totalWordCount(), 5/*wordCount*/);

        assertEquals(deserializer.readWord(), -1L);
        assertEquals(deserializer.readWord(), -112894714L);
        assertEquals(deserializer.readWord(), 0L);
        assertEquals(deserializer.readWord(), Long.MAX_VALUE);
        assertEquals(deserializer.readWord(), Long.MIN_VALUE);
    }

    /**
     * A smoke/fuzz test for ascending (from zero) word values.
     */
    @Test
    public void ascendingSmokeTest() {
        for(int wordLength=5; wordLength<65; wordLength++) {
            runAscendingTest(wordLength, 3/*bytePadding, arbitrary*/, 100000/*wordCount, arbitrary*/);
        }
    }

    /**
     * A smoke/fuzz test for random word values.
     */
    @Test
    public void randomSmokeTest() {
        for(int wordLength=5; wordLength<65; wordLength++) {
            runRandomTest(wordLength, 3/*bytePadding, arbitrary*/, 100000/*wordCount, arbitrary*/);
        }
    }

    // ------------------------------------------------------------------------
    /**
     * Runs a test which serializes and deserializes random word values.
     *
     * @param wordLength the length of words to test
     * @param bytePadding the number of bytes padding the byte array
     * @param wordCount the number of word values to test
     */
    private static void runRandomTest(final int wordLength, final int bytePadding, final int wordCount) {
        final long seed = randomLong();
        final Random random = new Random(seed);
        final Random verificationRandom = new Random(seed);

        final long wordMask;
        if(wordLength == 64) {
            wordMask = ~0L;
        } else {
            wordMask = (1L << wordLength) - 1L;
        }
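        // NOTE (added, hedged): the 64-bit special case above exists because Java
        // shift distances are taken mod 64, so (1L << 64) evaluates to 1L and
        // ((1L << 64) - 1L) would produce 0L rather than an all-ones mask.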

        final BigEndianAscendingWordSerializer serializer =
                new BigEndianAscendingWordSerializer(wordLength/*wordLength, arbitrary*/,
                                                     wordCount,
                                                     bytePadding/*bytePadding, arbitrary*/);

        for(int i=0; i<wordCount; i++) {
            final long value = random.nextLong() & wordMask;
            serializer.writeWord(value);
        }

        final byte[] bytes = serializer.getBytes();

        final BigEndianAscendingWordDeserializer deserializer =
                new BigEndianAscendingWordDeserializer(wordLength, bytePadding, bytes);

        assertEquals(deserializer.totalWordCount(), wordCount);
        for(int i=0; i<wordCount; i++) {
            assertEquals(deserializer.readWord(), (verificationRandom.nextLong() & wordMask));
        }
    }

    /**
     * Runs a test which serializes and deserializes ascending (from zero) word values.
     *
     * @param wordLength the length of words to test
     * @param bytePadding the number of bytes padding the byte array
     * @param wordCount the number of word values to test
     */
    private static void runAscendingTest(final int wordLength, final int bytePadding, final int wordCount) {
        final long wordMask;
        if(wordLength == 64) {
            wordMask = ~0L;
        } else {
            wordMask = (1L << wordLength) - 1L;
        }

        final BigEndianAscendingWordSerializer serializer =
                new BigEndianAscendingWordSerializer(wordLength/*wordLength, arbitrary*/,
                                                     wordCount,
                                                     bytePadding/*bytePadding, arbitrary*/);

        for(long i=0; i<wordCount; i++) {
            serializer.writeWord(i & wordMask);
        }

        final byte[] bytes = serializer.getBytes();

        final BigEndianAscendingWordDeserializer deserializer =
                new BigEndianAscendingWordDeserializer(wordLength, bytePadding, bytes);

        assertEquals(deserializer.totalWordCount(), wordCount);
        for(long i=0; i<wordCount; i++) {
            assertEquals(deserializer.readWord(), i & wordMask);
        }
    }
}

@@ -0,0 +1,337 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.util.hll;

import java.util.Arrays;

import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;

/**
 * Unit tests for {@link BigEndianAscendingWordSerializer}.
 */
public class BigEndianAscendingWordSerializerTest extends LuceneTestCase {
    /**
     * Error checking tests for the constructor.
     */
    @Test
    public void constructorErrorTest() {
        // word length too small
        try {
            new BigEndianAscendingWordSerializer(0/*wordLength, below minimum of 1*/, 1/*wordCount, arbitrary*/, 0/*bytePadding, arbitrary*/);
            fail("Should complain about too-short words.");
        } catch(final IllegalArgumentException e) {
            assertTrue(e.getMessage().contains("Word length must be"));
        }

        // word length too large
        try {
            new BigEndianAscendingWordSerializer(65/*wordLength, above max of 64*/, 1/*wordCount, arbitrary*/, 0/*bytePadding, arbitrary*/);
            fail("Should complain about too-long words.");
        } catch(final IllegalArgumentException e) {
            assertTrue(e.getMessage().contains("Word length must be"));
        }

        // word count negative
        try {
            new BigEndianAscendingWordSerializer(5/*wordLength, arbitrary*/, -1/*wordCount, too small*/, 0/*bytePadding, arbitrary*/);
            fail("Should complain about negative word count.");
        } catch(final IllegalArgumentException e) {
            assertTrue(e.getMessage().contains("Word count must be"));
        }

        // byte padding negative
        try {
            new BigEndianAscendingWordSerializer(5/*wordLength, arbitrary*/, 1/*wordCount, arbitrary*/, -1/*bytePadding, too small*/);
            fail("Should complain about negative byte padding.");
        } catch(final IllegalArgumentException e) {
            assertTrue(e.getMessage().contains("Byte padding must be"));
        }
    }

    /**
     * Tests the runtime exception thrown by a premature call to {@link BigEndianAscendingWordSerializer#getBytes()}.
     */
    @Test
    public void earlyGetBytesTest() {
        final BigEndianAscendingWordSerializer serializer =
                new BigEndianAscendingWordSerializer(5/*wordLength, arbitrary*/,
                                                     1/*wordCount*/,
                                                     0/*bytePadding, arbitrary*/);

        // getBytes() before enough writeWord() calls should throw
        try {
            serializer.getBytes();
            fail("Should throw.");
        } catch(final RuntimeException e) {
            assertTrue(e.getMessage().contains("Not all words"));
        }
    }

    /**
     * Smoke test using 64-bit words.
     */
    @Test
    public void smokeTestExplicitParams() {
        final int shortWordLength = 64/*longs used in LongSetSlab*/;
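        // (added note, hedged) for reference, the expected serialized size in the
        // cases below is: bytePadding + (int)Math.ceil((wordLength * wordCount) / 8.0)
        // bytes, which matches each expectedBytes array length.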

        {// Should work on an empty sequence, with no padding.
            final BigEndianAscendingWordSerializer serializer =
                    new BigEndianAscendingWordSerializer(shortWordLength,
                                                         0/*wordCount*/,
                                                         0/*bytePadding, none*/);

            assertTrue(Arrays.equals(serializer.getBytes(), new byte[0]));
        }
        {// Should work on a byte-divisible sequence, with no padding.
            final BigEndianAscendingWordSerializer serializer =
                    new BigEndianAscendingWordSerializer(shortWordLength,
                                                         2/*wordCount*/,
                                                         0/*bytePadding, none*/);

            serializer.writeWord(0xBAAAAAAAAAAAAAACL);
            serializer.writeWord(0x8FFFFFFFFFFFFFF1L);

            // Bytes:
            // ======
            // 0xBA 0xAA 0xAA 0xAA 0xAA 0xAA 0xAA 0xAC
            // 0x8F 0xFF 0xFF 0xFF 0xFF 0xFF 0xFF 0xF1
            //
            // -70 -86 ... -84
            // -113 -1 ... -15
            final byte[] bytes = serializer.getBytes();
            final byte[] expectedBytes = new byte[] { -70, -86, -86, -86, -86, -86, -86, -84,
                                                      -113, -1, -1, -1, -1, -1, -1, -15 };
            assertTrue(Arrays.equals(bytes, expectedBytes));
        }
        {// Should pad the array correctly.
            final BigEndianAscendingWordSerializer serializer =
                    new BigEndianAscendingWordSerializer(shortWordLength,
                                                         1/*wordCount*/,
                                                         1/*bytePadding*/);

            serializer.writeWord(1);
            // 1 byte of leading padding | value 1 (no trailing padding, since 64 bits fill bytes exactly)
            // 0000 0000 | 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0001
            // 0x00 | 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x01
            final byte[] bytes = serializer.getBytes();
            final byte[] expectedBytes = new byte[] { 0, 0, 0, 0, 0, 0, 0, 0, 1 };
            assertTrue(Arrays.equals(bytes, expectedBytes));
        }
    }

    /**
     * Smoke test for typical parameters used in practice.
     */
    @Test
    public void smokeTestProbabilisticParams() {
        // XXX: revisit this
        final int shortWordLength = 5;
        {// Should work on an empty sequence, with no padding.
            final BigEndianAscendingWordSerializer serializer =
                    new BigEndianAscendingWordSerializer(shortWordLength,
                                                         0/*wordCount*/,
                                                         0/*bytePadding, none*/);

            assertTrue(Arrays.equals(serializer.getBytes(), new byte[0]));
        }
        {// Should work on a non-byte-divisible sequence, with no padding.
            final BigEndianAscendingWordSerializer serializer =
                    new BigEndianAscendingWordSerializer(shortWordLength,
                                                         3/*wordCount*/,
                                                         0/*bytePadding, none*/);

            serializer.writeWord(9);
            serializer.writeWord(31);
            serializer.writeWord(1);

            // The values:
            // -----------
            // 9 |31 |1 |padding

            // Corresponding bits:
            // ------------------
            // 0100 1|111 11|00 001|0

            // And the hex/decimal (remember Java bytes are signed):
            // -----------------------------------------------------
            // 0100 1111 -> 0x4F -> 79
            // 1100 0010 -> 0xC2 -> -62

            final byte[] bytes = serializer.getBytes();
            final byte[] expectedBytes = new byte[] { 79, -62 };
            assertTrue(Arrays.equals(bytes, expectedBytes));
        }
        {// Should work on a byte-divisible sequence, with no padding.
            final BigEndianAscendingWordSerializer serializer =
                    new BigEndianAscendingWordSerializer(shortWordLength,
                                                         8/*wordCount*/,
                                                         0/*bytePadding, none*/);

            for(int i=1; i<9; i++) {
                serializer.writeWord(i);
            }

            // Values: 1-8
            // Corresponding bits:
            // ------------------
            // 00001
            // 00010
            // 00011
            // 00100
            // 00101
            // 00110
            // 00111
            // 01000

            // And the hex:
            // ------------
            // 0000 1000 => 0x08 => 8
            // 1000 0110 => 0x86 => -122
            // 0100 0010 => 0x42 => 66
            // 1001 1000 => 0x98 => -104
            // 1110 1000 => 0xE8 => -24

            final byte[] bytes = serializer.getBytes();
            final byte[] expectedBytes = new byte[] { 8, -122, 66, -104, -24 };
            assertTrue(Arrays.equals(bytes, expectedBytes));
        }
        {// Should pad the array correctly.
            final BigEndianAscendingWordSerializer serializer =
                    new BigEndianAscendingWordSerializer(shortWordLength,
                                                         1/*wordCount*/,
                                                         1/*bytePadding*/);

            serializer.writeWord(1);
            // 1 byte of leading padding | value 1 | trailing padding
            // 0000 0000 | 0000 1|000
            final byte[] bytes = serializer.getBytes();
            final byte[] expectedBytes = new byte[] { 0, 8 };
            assertTrue(Arrays.equals(bytes, expectedBytes));
        }
    }

    /**
     * Smoke test for typical parameters used in practice.
     */
    @Test
    public void smokeTestSparseParams() {
        // XXX: revisit
        final int shortWordLength = 17;
        {// Should work on an empty sequence, with no padding.
            final BigEndianAscendingWordSerializer serializer =
                    new BigEndianAscendingWordSerializer(shortWordLength,
                                                         0/*wordCount*/,
                                                         0/*bytePadding, none*/);

            assertTrue(Arrays.equals(serializer.getBytes(), new byte[0]));
        }
        {// Should work on a non-byte-divisible sequence, with no padding.
            final BigEndianAscendingWordSerializer serializer =
                    new BigEndianAscendingWordSerializer(shortWordLength,
                                                         3/*wordCount*/,
                                                         0/*bytePadding, none*/);

            serializer.writeWord(9);
            serializer.writeWord(42);
            serializer.writeWord(75);

            // The values:
            // -----------
            // 9 |42 |75 |padding

            // Corresponding bits:
            // ------------------
            // 0000 0000 0000 0100 1|000 0000 0000 1010 10|00 0000 0000 1001 011|0 0000

            // And the hex/decimal (remember Java bytes are signed):
            // -----------------------------------------------------
            // 0000 0000 -> 0x00 -> 0
            // 0000 0100 -> 0x04 -> 4
            // 1000 0000 -> 0x80 -> -128
            // 0000 1010 -> 0x0A -> 10
            // 1000 0000 -> 0x80 -> -128
            // 0000 1001 -> 0x09 -> 9
            // 0110 0000 -> 0x60 -> 96

            final byte[] bytes = serializer.getBytes();
            final byte[] expectedBytes = new byte[] { 0, 4, -128, 10, -128, 9, 96 };
            assertTrue(Arrays.equals(bytes, expectedBytes));
        }
        {// Should work on a byte-divisible sequence, with no padding.
            final BigEndianAscendingWordSerializer serializer =
                    new BigEndianAscendingWordSerializer(shortWordLength,
                                                         8/*wordCount*/,
                                                         0/*bytePadding, none*/);

            for(int i=1; i<9; i++) {
                serializer.writeWord(i);
            }

            // Values: 1-8
            // Corresponding bits:
            // ------------------
            // 0000 0000 0000 0000 1
            // 000 0000 0000 0000 10
            // 00 0000 0000 0000 011
            // 0 0000 0000 0000 0100

            // 0000 0000 0000 0010 1
            // 000 0000 0000 0001 10
            // 00 0000 0000 0000 111
            // 0 0000 0000 0000 1000

            // And the hex:
            // ------------
            // 0000 0000 -> 0x00 -> 0
            // 0000 0000 -> 0x00 -> 0
            // 1000 0000 -> 0x80 -> -128
            // 0000 0000 -> 0x00 -> 0
            // 1000 0000 -> 0x80 -> -128
            // 0000 0000 -> 0x00 -> 0
            // 0110 0000 -> 0x60 -> 96
            // 0000 0000 -> 0x00 -> 0
            // 0100 0000 -> 0x40 -> 64
            // 0000 0000 -> 0x00 -> 0
            // 0010 1000 -> 0x28 -> 40
            // 0000 0000 -> 0x00 -> 0
            // 0001 1000 -> 0x18 -> 24
            // 0000 0000 -> 0x00 -> 0
            // 0000 1110 -> 0x0E -> 14
            // 0000 0000 -> 0x00 -> 0
            // 0000 1000 -> 0x08 -> 8

            final byte[] bytes = serializer.getBytes();
            final byte[] expectedBytes = new byte[] { 0, 0, -128, 0, -128, 0, 96, 0, 64, 0, 40, 0, 24, 0, 14, 0, 8 };
            assertTrue(Arrays.equals(bytes, expectedBytes));
        }
        {// Should pad the array correctly.
            final BigEndianAscendingWordSerializer serializer =
                    new BigEndianAscendingWordSerializer(shortWordLength,
                                                         1/*wordCount*/,
                                                         1/*bytePadding*/);

            serializer.writeWord(1);
            // 1 byte of leading padding | value 1 | trailing padding
            // 0000 0000 | 0000 0000 0000 0000 1|000 0000
            // 0x00 0x00 0x00 0x80
            final byte[] bytes = serializer.getBytes();
            final byte[] expectedBytes = new byte[] { 0, 0, 0, -128 };
            assertTrue(Arrays.equals(bytes, expectedBytes));
        }
    }
}

@@ -0,0 +1,167 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.util.hll;

import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;

/**
 * Unit tests for {@link BitVector}.
 */
public class BitVectorTest extends LuceneTestCase {
    /**
     * Tests {@link BitVector#getRegister(long)} and {@link BitVector#setRegister(long, long)}.
     */
    @Test
    public void getSetRegisterTest() {
        { // locally scoped for sanity
            // NOTE: registers are only 5 bits wide
            final BitVector vector1 = new BitVector(5/*width*/, 128/*count, 2^7*/);
            final BitVector vector2 = new BitVector(5/*width*/, 128/*count, 2^7*/);
            final BitVector vector3 = new BitVector(5/*width*/, 128/*count, 2^7*/);
            final BitVector vector4 = new BitVector(5/*width*/, 128/*count, 2^7*/);

            for(int i=0; i<128/*2^7*/; i++) {
                vector1.setRegister(i, 0x1F);
                vector2.setRegister(i, (i & 0x1F));
                vector3.setRegister(i, ((127 - i) & 0x1F));
                vector4.setRegister(i, 0x15);
            }

            for(int i=0; i<128/*2^7*/; i++) {
                assertEquals(vector1.getRegister(i), 0x1F);
                assertEquals(vector2.getRegister(i), (i & 0x1F));
                assertEquals(vector3.getRegister(i), ((127 - i) & 0x1F));
                assertEquals(vector4.getRegister(i), 0x15);
            }
        }
    }

    // ========================================================================
    /**
     * Tests {@link BitVector#registerIterator()}.
     */
    @Test
    public void registerIteratorTest() {
        { // scoped locally for sanity
            // NOTE: registers are only 5 bits wide
            final BitVector vector1 = new BitVector(5/*width*/, 128/*count, 2^7*/);
            final BitVector vector2 = new BitVector(5/*width*/, 128/*count, 2^7*/);
            final BitVector vector3 = new BitVector(5/*width*/, 128/*count, 2^7*/);
            final BitVector vector4 = new BitVector(5/*width*/, 128/*count, 2^7*/);

            for(int i=0; i<128/*2^7*/; i++) {
                vector1.setRegister(i, 0x1F);
                vector2.setRegister(i, (i & 0x1F));
                vector3.setRegister(i, ((127 - i) & 0x1F));
                vector4.setRegister(i, 0x15);
            }

            final LongIterator registerIterator1 = vector1.registerIterator();
            final LongIterator registerIterator2 = vector2.registerIterator();
            final LongIterator registerIterator3 = vector3.registerIterator();
            final LongIterator registerIterator4 = vector4.registerIterator();
            for(int i=0; i<128/*2^7*/; i++) {
                assertEquals(registerIterator1.hasNext(), true);
                assertEquals(registerIterator2.hasNext(), true);
                assertEquals(registerIterator3.hasNext(), true);
                assertEquals(registerIterator4.hasNext(), true);

                assertEquals(registerIterator1.next(), 0x1F);
                assertEquals(registerIterator2.next(), (i & 0x1F));
                assertEquals(registerIterator3.next(), ((127 - i) & 0x1F));
                assertEquals(registerIterator4.next(), 0x15);
            }
            assertEquals(registerIterator1.hasNext(), false/*no more*/);
            assertEquals(registerIterator2.hasNext(), false/*no more*/);
            assertEquals(registerIterator3.hasNext(), false/*no more*/);
            assertEquals(registerIterator4.hasNext(), false/*no more*/);
        }

        { // scoped locally for sanity
            // Vectors that are shorter than one word
            assertIterator(1, 12/* 1*12=12 bits, fewer than a single word */);
            assertIterator(2, 12/* 2*12=24 bits, fewer than a single word */);
            assertIterator(3, 12/* 3*12=36 bits, fewer than a single word */);
            assertIterator(4, 12/* 4*12=48 bits, fewer than a single word */);

            // Vectors that don't fit exactly into longs
            assertIterator(5, 16/* 5*16=80 bits */);
            assertIterator(5, 32/* 5*32=160 bits */);
        }

        // TODO: iterate over vectors that are padded
    }

    private static void assertIterator(final int width, final int count) {
        final BitVector vector = new BitVector(width, count);
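        // (added note, hedged) a freshly constructed BitVector presumably
        // zero-initializes its registers, so the iterator is expected to yield
        // exactly 'count' zeros below.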
        final LongIterator iter = vector.registerIterator();

        for(int i=0; i<count; i++) {
            assertTrue(String.format("expected more elements: width=%s, count=%s", width, count), iter.hasNext());
            // TODO: fill with a sentinel value
            assertEquals(iter.next(), 0);
        }
        assertFalse(String.format("expected no more elements: width=%s, count=%s", width, count), iter.hasNext());
    }

    // ========================================================================
    /**
     * Tests {@link BitVector#setMaxRegister(long, long)}.
     */
    @Test
    public void setMaxRegisterTest() {
        final BitVector vector = new BitVector(5/*width*/, 128/*count, 2^7*/);

        vector.setRegister(0, 10);
        // should replace with a larger value
        vector.setMaxRegister(0, 11);
        assertEquals(vector.getRegister(0), 11);
        // should not replace with a smaller or equal value
        vector.setMaxRegister(0, 9);
        assertEquals(vector.getRegister(0), 11);
        vector.setMaxRegister(0, 11);
        assertEquals(vector.getRegister(0), 11);
    }

    // ========================================================================
    // fill
    /**
     * Tests {@link BitVector#fill(long)}.
     */
    @Test
    public void fillTest() {
        final BitVector vector = new BitVector(5/*width*/, 128/*count, 2^7*/);

        for(int i=0; i<128/*2^7*/; i++) {
            vector.setRegister(i, i);
        }

        vector.fill(0L);

        for(int i=0; i<128/*2^7*/; i++) {
            assertEquals(vector.getRegister(i), 0);
        }

        vector.fill(17L/*arbitrary*/);

        for(int i=0; i<128/*2^7*/; i++) {
            assertEquals(vector.getRegister(i), 17/*arbitrary*/);
        }
    }
}

@@ -0,0 +1,235 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.util.hll;

import java.util.HashSet;

import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;

import com.carrotsearch.hppc.LongOpenHashSet;
import static com.carrotsearch.randomizedtesting.RandomizedTest.*;

/**
 * Tests {@link HLL} of type {@link HLLType#EXPLICIT}.
 */
public class ExplicitHLLTest extends LuceneTestCase {
    /**
     * Tests basic set semantics of {@link HLL#addRaw(long)}.
     */
    @Test
    public void addBasicTest() {
        { // Adding a single positive value to an empty set should work.
            final HLL hll = newHLL(128/*arbitrary*/);
            hll.addRaw(1L/*positive*/);
            assertEquals(hll.cardinality(), 1L);
        }
        { // Adding a single negative value to an empty set should work.
            final HLL hll = newHLL(128/*arbitrary*/);
            hll.addRaw(-1L/*negative*/);
            assertEquals(hll.cardinality(), 1L);
        }
        { // Adding a duplicate value to a set should be a no-op.
            final HLL hll = newHLL(128/*arbitrary*/);
            hll.addRaw(1L/*positive*/);
            assertEquals(hll.cardinality(), 1L/*arbitrary*/);
            hll.addRaw(1L/*dupe*/);
            assertEquals(hll.cardinality(), 1L/*unchanged*/);
        }
    }

    // ------------------------------------------------------------------------
    /**
     * Tests {@link HLL#union(HLL)}.
     */
    @Test
    public void unionTest() {
        {// Unioning two distinct sets should work
            final HLL hllA = newHLL(128/*arbitrary*/);
            final HLL hllB = newHLL(128/*arbitrary*/);
            hllA.addRaw(1L);
            hllA.addRaw(2L);
            hllB.addRaw(3L);

            hllA.union(hllB);
            assertEquals(hllA.cardinality(), 3);
        }
        {// Unioning two sets whose union doesn't exceed the cardinality cap should not promote
            final HLL hllA = newHLL(128/*arbitrary*/);
            final HLL hllB = newHLL(128/*arbitrary*/);
            hllA.addRaw(1L);
            hllA.addRaw(2L);
            hllB.addRaw(1L);

            hllA.union(hllB);
            assertEquals(hllA.cardinality(), 2);
        }
        {// Unioning two sets whose union exceeds the cardinality cap should promote
            final HLL hllA = newHLL(128/*arbitrary*/);
            final HLL hllB = newHLL(128/*arbitrary*/);

            // fill up both sets to the explicitThreshold
            for(long i=0; i<128/*explicitThreshold*/; i++) {
                hllA.addRaw(i);
                hllB.addRaw(i + 128);
            }

            hllA.union(hllB);
            assertEquals(hllA.getType(), HLLType.SPARSE);
        }
    }

    // ------------------------------------------------------------------------
    /**
     * Tests {@link HLL#clear()}.
     */
    @Test
    public void clearTest() {
        final HLL hll = newHLL(128/*arbitrary*/);
        hll.addRaw(1L);
        assertEquals(hll.cardinality(), 1L);
        hll.clear();
        assertEquals(hll.cardinality(), 0L);
    }

    // ------------------------------------------------------------------------
    /**
     * Tests {@link HLL#toBytes(ISchemaVersion)} and {@link HLL#fromBytes(byte[])}.
     */
    @Test
    public void toFromBytesTest() {
        final ISchemaVersion schemaVersion = SerializationUtil.DEFAULT_SCHEMA_VERSION;
        final HLLType type = HLLType.EXPLICIT;
        final int padding = schemaVersion.paddingBytes(type);
        final int bytesPerWord = 8;

        {// Should work on an empty set
            final HLL hll = newHLL(128/*arbitrary*/);

            final byte[] bytes = hll.toBytes(schemaVersion);

            // assert the output has the correct byte length
            assertEquals(bytes.length, padding/*no elements, just padding*/);

            final HLL inHLL = HLL.fromBytes(bytes);

            assertElementsEqual(hll, inHLL);
        }
        {// Should work on a partially filled set
            final HLL hll = newHLL(128/*arbitrary*/);

            for(int i=0; i<3; i++) {
                hll.addRaw(i);
            }

            final byte[] bytes = hll.toBytes(schemaVersion);

            // assert the output has the correct byte length
            assertEquals(bytes.length, padding + (bytesPerWord * 3/*elements*/));

            final HLL inHLL = HLL.fromBytes(bytes);

            assertElementsEqual(hll, inHLL);
        }
        {// Should work on a full set
            final int explicitThreshold = 128;
            final HLL hll = newHLL(explicitThreshold);

            for(int i=0; i<explicitThreshold; i++) {
                hll.addRaw(27 + i/*arbitrary*/);
            }

            final byte[] bytes = hll.toBytes(schemaVersion);

            // assert the output has the correct byte length
            assertEquals(bytes.length, padding + (bytesPerWord * explicitThreshold/*elements*/));

            final HLL inHLL = HLL.fromBytes(bytes);

            assertElementsEqual(hll, inHLL);
        }
    }

    // ------------------------------------------------------------------------
    /**
     * Tests correctness against {@link java.util.HashSet}.
     */
    @Test
    public void randomValuesTest() {
        final int explicitThreshold = 4096;
        final HashSet<Long> canonical = new HashSet<Long>();
        final HLL hll = newHLL(explicitThreshold);

        for(int i=0; i<explicitThreshold; i++) {
            final long randomLong = randomLong();
            canonical.add(randomLong);
            hll.addRaw(randomLong);
        }
        final int canonicalCardinality = canonical.size();
        assertEquals(hll.cardinality(), canonicalCardinality);
    }

    // ------------------------------------------------------------------------
    /**
     * Tests promotion to {@link HLLType#SPARSE} and {@link HLLType#FULL}.
     */
    @Test
    public void promotionTest() {
        { // locally scoped for sanity
            final int explicitThreshold = 128;
            final HLL hll = new HLL(11/*log2m, unused*/, 5/*regwidth, unused*/, explicitThreshold, 256/*sparseThreshold*/, HLLType.EXPLICIT);

            for(int i=0; i<explicitThreshold + 1; i++) {
                hll.addRaw(i);
            }
            assertEquals(hll.getType(), HLLType.SPARSE);
        }
        { // locally scoped for sanity
            final HLL hll = new HLL(11/*log2m, unused*/, 5/*regwidth, unused*/, 4/*expthresh => explicitThreshold = 8*/, false/*sparseon*/, HLLType.EXPLICIT);
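            // (added note, hedged) with expthresh=4 the explicit cutoff is
            // 2^(4 - 1) = 8 values (consistent with the inline comment above);
            // the 9th distinct value added below forces a promotion, and because
            // sparseon is false the set presumably skips SPARSE and goes to FULL.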

            for(int i=0; i<9/* > explicitThreshold */; i++) {
                hll.addRaw(i);
            }
            assertEquals(hll.getType(), HLLType.FULL);
        }
    }

    // ************************************************************************
    // assertion helpers
    /**
     * Asserts that the values in both sets are exactly equal.
     */
    private static void assertElementsEqual(final HLL hllA, final HLL hllB) {
        final LongOpenHashSet internalSetA = hllA.explicitStorage;
        final LongOpenHashSet internalSetB = hllB.explicitStorage;

        assertTrue(internalSetA.equals(internalSetB));
    }

    /**
     * Builds a {@link HLLType#EXPLICIT} {@link HLL} instance with the specified
     * explicit threshold.
     *
     * @param  explicitThreshold explicit threshold to use for the constructed
     *         {@link HLL}. This must be greater than zero.
     * @return a default-sized {@link HLLType#EXPLICIT} empty {@link HLL} instance.
     *         This will never be <code>null</code>.
     */
    private static HLL newHLL(final int explicitThreshold) {
        return new HLL(11/*log2m, unused*/, 5/*regwidth, unused*/, explicitThreshold, 256/*sparseThreshold, arbitrary, unused*/, HLLType.EXPLICIT);
    }
}

@@ -0,0 +1,341 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.util.hll;

import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;

/**
 * Tests {@link HLL} of type {@link HLLType#FULL}.
 */
public class FullHLLTest extends LuceneTestCase {
    // TODO union test
    /**
     * Smoke test for {@link HLL#cardinality()} and the proper use of the
     * small range correction.
     */
    @Test
    public void smallRangeSmokeTest() {
        final int log2m = 11;
        final int m = (1 << log2m);
        final int regwidth = 5;

        // only one register set
        {
            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
            hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 0/*ix*/, 1/*val*/));

            final long cardinality = hll.cardinality();

            // Trivially true that small correction conditions hold: one register
            // set implies zeroes exist, and estimator trivially smaller than 5m/2.
            // Small range correction: m * log(m/V)
            final long expected = (long)Math.ceil(m * Math.log((double)m / (m - 1)/*# of zeroes*/));
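            // (added note) worked number: log2m=11 gives m=2048 and V=2047 zero
            // registers, so expected = ceil(2048 * ln(2048/2047)) = ceil(~1.0002) = 2.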
            assertEquals(cardinality, expected);
        }

        // all but one register set
        {
            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
            for(int i=0; i<(m - 1); i++) {
                hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i/*ix*/, 1/*val*/));
            }

            // Trivially true that small correction conditions hold: all but
            // one register set implies a zero exists, and estimator trivially
            // smaller than 5m/2 since it's alpha / ((m-1)/2)
            final long cardinality = hll.cardinality();

            // Small range correction: m * log(m/V)
            final long expected = (long)Math.ceil(m * Math.log((double)m / 1/*# of zeroes*/));
            assertEquals(cardinality, expected);
        }
    }

    /**
     * Smoke test for {@link HLL#cardinality()} and the proper use of the
     * uncorrected estimator.
     */
    @Test
    public void normalRangeSmokeTest() {
        final int log2m = 11;
        final int regwidth = 5;
        // regwidth = 5, so the hash space is
        // log2m + (2^5 - 1 - 1), so L = log2m + 30
        final int l = log2m + 30;
        final int m = (1 << log2m);
        final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);

        // all registers at a 'medium' value
        {
            final int registerValue = 7/*chosen to ensure neither correction kicks in*/;
            for(int i=0; i<m; i++) {
                hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, registerValue));
            }

            final long cardinality = hll.cardinality();

            // Simplified estimator when all registers take the same value: alpha / (m/2^val)
            final double estimator = HLLUtil.alphaMSquared(m)/((double)m/Math.pow(2, registerValue));

            // Assert the conditions for the uncorrected range
            assertTrue(estimator <= Math.pow(2, l)/30);
            assertTrue(estimator > (5 * m /(double)2));

            final long expected = (long)Math.ceil(estimator);
            assertEquals(cardinality, expected);
        }
    }

    /**
     * Smoke test for {@link HLL#cardinality()} and the proper use of the large
     * range correction.
     */
    @Test
    public void largeRangeSmokeTest() {
        final int log2m = 12;
        final int regwidth = 5;
        // regwidth = 5, so the hash space is
        // log2m + (2^5 - 1 - 1), so L = log2m + 30
        final int l = log2m + 30;
        final int m = (1 << log2m);
        final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);

        {
            final int registerValue = 31/*chosen to ensure the large correction kicks in*/;
            for(int i=0; i<m; i++) {
                hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, registerValue));
            }

            final long cardinality = hll.cardinality();

            // Simplified estimator when all registers take the same value: alpha / (m/2^val)
            final double estimator = HLLUtil.alphaMSquared(m)/((double)m/Math.pow(2, registerValue));

            // Assert the condition for the large range
            assertTrue(estimator > Math.pow(2, l)/30);

            // Large range correction: -2^L * log(1 - E/2^L)
            final long expected = (long)Math.ceil(-1.0 * Math.pow(2, l) * Math.log(1.0 - estimator/Math.pow(2, l)));
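            // (added note) the assertion above establishes E > 2^L/30, so
            // cardinality() is expected to apply the large range correction
            // E* = -2^L * ln(1 - E/2^L) rather than returning the raw estimator.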
            assertEquals(cardinality, expected);
        }
    }

    // ========================================================================
    /**
     * Tests the bounds on a register's value for a given raw input value.
     */
    @Test
    public void registerValueTest() {
        final int log2m = 4/*small enough to make testing easy (addRaw() shifts by one byte)*/;
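        // (added note, hedged) for each raw value below, the low log2m bits select
        // the register 'j', and the stored value is presumably
        // Long.numberOfTrailingZeros(rawValue >>> log2m) + 1, capped at the
        // register maximum (2^regwidth - 1).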

        // register width 4 (the minimum size)
        { // scoped locally for sanity
            final int regwidth = 4;
            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
            final BitVector bitVector = hll.probabilisticStorage;

            // lower-bounds of the register
            hll.addRaw(0x0000000000000001L/*'j'=1*/);
            assertEquals(bitVector.getRegister(1/*'j'*/), 0);

            hll.addRaw(0x0000000000000012L/*'j'=2*/);
            assertEquals(bitVector.getRegister(2/*'j'*/), 1);

            hll.addRaw(0x0000000000000023L/*'j'=3*/);
            assertEquals(bitVector.getRegister(3/*'j'*/), 2);

            hll.addRaw(0x0000000000000044L/*'j'=4*/);
            assertEquals(bitVector.getRegister(4/*'j'*/), 3);

            hll.addRaw(0x0000000000000085L/*'j'=5*/);
            assertEquals(bitVector.getRegister(5/*'j'*/), 4);

            // upper-bounds of the register
            // NOTE: bear in mind that BitVector itself ensures that
            //       overflow of a register is prevented
            hll.addRaw(0x0000000000010006L/*'j'=6*/);
            assertEquals(bitVector.getRegister(6/*'j'*/), 13);

            hll.addRaw(0x0000000000020007L/*'j'=7*/);
            assertEquals(bitVector.getRegister(7/*'j'*/), 14);

            hll.addRaw(0x0000000000040008L/*'j'=8*/);
            assertEquals(bitVector.getRegister(8/*'j'*/), 15);

            hll.addRaw(0x0000000000080009L/*'j'=9*/);
            assertEquals(bitVector.getRegister(9/*'j'*/), 15/*overflow*/);

            // sanity checks to ensure that no bits other than the lowest-set
            // bit matter
            // NOTE: same as the 'j'=6 case above
            hll.addRaw(0x000000000003000AL/*'j'=10*/);
            assertEquals(bitVector.getRegister(10/*'j'*/), 13);

            hll.addRaw(0x000000000011000BL/*'j'=11*/);
            assertEquals(bitVector.getRegister(11/*'j'*/), 13);
        }

        // register width 5
        { // scoped locally for sanity
            final int regwidth = 5;
            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
            final BitVector bitVector = hll.probabilisticStorage;

            // lower-bounds of the register
            hll.addRaw(0x0000000000000001L/*'j'=1*/);
            assertEquals(bitVector.getRegister(1/*'j'*/), 0);

            hll.addRaw(0x0000000000000012L/*'j'=2*/);
            assertEquals(bitVector.getRegister(2/*'j'*/), 1);

            hll.addRaw(0x0000000000000023L/*'j'=3*/);
            assertEquals(bitVector.getRegister(3/*'j'*/), 2);

            hll.addRaw(0x0000000000000044L/*'j'=4*/);
            assertEquals(bitVector.getRegister(4/*'j'*/), 3);

            hll.addRaw(0x0000000000000085L/*'j'=5*/);
            assertEquals(bitVector.getRegister(5/*'j'*/), 4);

            // upper-bounds of the register
            // NOTE: bear in mind that BitVector itself ensures that
            //       overflow of a register is prevented
            hll.addRaw(0x0000000100000006L/*'j'=6*/);
            assertEquals(bitVector.getRegister(6/*'j'*/), 29);

            hll.addRaw(0x0000000200000007L/*'j'=7*/);
            assertEquals(bitVector.getRegister(7/*'j'*/), 30);

            hll.addRaw(0x0000000400000008L/*'j'=8*/);
            assertEquals(bitVector.getRegister(8/*'j'*/), 31);

            hll.addRaw(0x0000000800000009L/*'j'=9*/);
            assertEquals(bitVector.getRegister(9/*'j'*/), 31/*overflow*/);
        }
    }

    // ========================================================================
    /**
     * Tests {@link HLL#clear()}.
     */
    @Test
    public void clearTest() {
        final int regwidth = 5;
        final int log2m = 4/*16 registers per counter*/;
        final int m = 1 << log2m;

        final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
        final BitVector bitVector = hll.probabilisticStorage;
        for(int i=0; i<m; i++) {
            bitVector.setRegister(i, i);
        }

        hll.clear();
        for(int i=0; i<m; i++) {
            assertEquals(bitVector.getRegister(i), 0L/*default value of a register*/);
        }
    }

    // ========================================================================
    // Serialization
    /**
     * Tests {@link HLL#toBytes(ISchemaVersion)} and {@link HLL#fromBytes(byte[])}.
     */
    @Test
    public void toFromBytesTest() {
        final int log2m = 11/*arbitrary*/;
        final int regwidth = 5;

        final ISchemaVersion schemaVersion = SerializationUtil.DEFAULT_SCHEMA_VERSION;
        final HLLType type = HLLType.FULL;
        final int padding = schemaVersion.paddingBytes(type);
        final int dataByteCount = ProbabilisticTestUtil.getRequiredBytes(regwidth, (1 << log2m)/*aka 2^log2m = m*/);
        final int expectedByteCount = padding + dataByteCount;

        {// Should work on an empty element
            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
            final byte[] bytes = hll.toBytes(schemaVersion);

            // assert the output length is correct
            assertEquals(bytes.length, expectedByteCount);

            final HLL inHLL = HLL.fromBytes(bytes);

            // assert the register values are correct
            assertElementsEqual(hll, inHLL);
        }
        {// Should work on a partially filled element
            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);

            for(int i=0; i<3; i++) {
                final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, i, (i+9));
                hll.addRaw(rawValue);
            }

            final byte[] bytes = hll.toBytes(schemaVersion);

            // assert the output length is correct
            assertEquals(bytes.length, expectedByteCount);

            final HLL inHLL = HLL.fromBytes(bytes);

            // assert the register values are correct
            assertElementsEqual(hll, inHLL);
        }
        {// Should work on a full set
            final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);

            for(int i=0; i<(1 << log2m)/*aka 2^log2m*/; i++) {
                final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, i, (i % 9) + 1);
                hll.addRaw(rawValue);
            }

            final byte[] bytes = hll.toBytes(schemaVersion);

            // assert the output length is correct
            assertEquals(bytes.length, expectedByteCount);

            final HLL inHLL = HLL.fromBytes(bytes);

            // assert the register values are correct
            assertElementsEqual(hll, inHLL);
        }
    }

    // ************************************************************************
    // Assertion Helpers
    /**
     * Asserts that the two HLLs are register-wise equal.
     */
    private static void assertElementsEqual(final HLL hllA, final HLL hllB) {
        final BitVector bitVectorA = hllA.probabilisticStorage;
        final BitVector bitVectorB = hllB.probabilisticStorage;

        final LongIterator iterA = bitVectorA.registerIterator();
        final LongIterator iterB = bitVectorB.registerIterator();

        while(iterA.hasNext() && iterB.hasNext()) {
            assertEquals(iterA.next(), iterB.next());
        }
        assertFalse(iterA.hasNext());
        assertFalse(iterB.hasNext());
    }
}

@@ -0,0 +1,88 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.util.hll;

import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;

import static com.carrotsearch.randomizedtesting.RandomizedTest.*;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Random;

import static org.apache.solr.util.hll.HLL.*;

/**
 * Serialization smoke-tests.
 */
public class HLLSerializationTest extends LuceneTestCase {
    /**
     * A smoke-test that covers serialization/deserialization of an HLL
     * under all possible parameters.
     */
    @Test
    @Slow
    @Nightly
    public void serializationSmokeTest() throws Exception {
        final Random random = new Random(randomLong());
        final int randomCount = 250;
        final List<Long> randoms = new ArrayList<Long>(randomCount);
        for (int i=0; i<randomCount; i++) {
            randoms.add(random.nextLong());
        }

        assertCardinality(HLLType.EMPTY, randoms);
        assertCardinality(HLLType.EXPLICIT, randoms);
        assertCardinality(HLLType.SPARSE, randoms);
        assertCardinality(HLLType.FULL, randoms);
    }

    // NOTE: log2m<=16 was chosen as the max log2m parameter so that the test
    //       completes in a reasonable amount of time. Not much is gained by
    //       testing larger values - there are no more known serialization
    //       related edge cases that appear as log2m gets even larger.
    // NOTE: This test completed successfully with log2m<=MAXIMUM_LOG2M_PARAM
    //       on 2014-01-30.
    private static void assertCardinality(final HLLType hllType, final Collection<Long> items)
            throws CloneNotSupportedException {
        for(int log2m=MINIMUM_LOG2M_PARAM; log2m<=16; log2m++) {
            for(int regw=MINIMUM_REGWIDTH_PARAM; regw<=MAXIMUM_REGWIDTH_PARAM; regw++) {
                for(int expthr=MINIMUM_EXPTHRESH_PARAM; expthr<=MAXIMUM_EXPTHRESH_PARAM; expthr++) {
                    for(final boolean sparse: new boolean[]{true, false}) {
                        HLL hll = new HLL(log2m, regw, expthr, sparse, hllType);
                        for(final Long item: items) {
                            hll.addRaw(item);
                        }
                        HLL copy = HLL.fromBytes(hll.toBytes());
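                        // (added note, hedged) toBytes() with no argument presumably
                        // serializes with the default schema version; the round trip
                        // must preserve cardinality, type, and the exact bytes.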
                        assertEquals(copy.cardinality(), hll.cardinality());
                        assertEquals(copy.getType(), hll.getType());
                        assertTrue(Arrays.equals(copy.toBytes(), hll.toBytes()));

                        HLL clone = hll.clone();
                        assertEquals(clone.cardinality(), hll.cardinality());
                        assertEquals(clone.getType(), hll.getType());
                        assertTrue(Arrays.equals(clone.toBytes(), hll.toBytes()));
                    }
                }
            }
        }
    }
}

@@ -0,0 +1,47 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.util.hll;

import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;

/**
 * Tests {@link HLLUtil} static methods.
 *
 * @author tkarnezo
 */
public class HLLUtilTest extends LuceneTestCase {
    /**
     * Tests that {@link HLLUtil#largeEstimatorCutoff(int, int)} is the same
     * as a trivial implementation.
     */
    @Test
    public void largeEstimatorCutoffTest() {
        for(int log2m=HLL.MINIMUM_LOG2M_PARAM; log2m<=HLL.MAXIMUM_LOG2M_PARAM; log2m++) {
            for(int regWidth=HLL.MINIMUM_REGWIDTH_PARAM; regWidth<=HLL.MAXIMUM_REGWIDTH_PARAM; regWidth++) {
                final double cutoff = HLLUtil.largeEstimatorCutoff(log2m, regWidth);

                // See the blog post (http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/)
                // and the original paper (Fig. 3) for information on 2^L and the
                // "large range correction" cutoff.
                final double expected = Math.pow(2, Math.pow(2, regWidth) - 2 + log2m) / 30.0;
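                // (added note) this is 2^L / 30 with L = log2m + (2^regWidth - 2);
                // for example log2m=11, regWidth=5 gives 2^41 / 30.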
                assertEquals(cutoff, expected, 0.0001);
            }
        }
    }
}

@@ -0,0 +1,708 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.util.hll;

import static com.carrotsearch.randomizedtesting.RandomizedTest.*;
import static org.apache.solr.util.hll.ProbabilisticTestUtil.*;

import java.io.FileWriter;
import java.io.IOException;
import java.util.Random;

/**
 * Generates test files for testing other implementations of HLL
 * serialization/deserialization, namely the PostgreSQL implementation.
 */
public class IntegrationTestGenerator {
    // ************************************************************************
    // directory to which the generated tests are written
    private static final String OUTPUT_DIRECTORY = "/tmp/hll_test/";

    // ------------------------------------------------------------------------
    // configurations for HLLs; these should mirror the settings in the PostgreSQL impl. tests
    private static final int REGWIDTH = 5;
    private static final int LOG2M = 11;
    // NOTE: This differs from the PostgreSQL impl. parameter 'expthresh'. This
    //       is a literal threshold to use in the promotion hierarchy, implying
    //       both that the EXPLICIT representation should be used and that the
    //       threshold should NOT be automatically computed. This is done to
    //       ensure that the parameters of the test are very explicitly defined.
    private static final int EXPLICIT_THRESHOLD = 256;
    // NOTE: This is not the PostgreSQL impl. parameter 'sparseon'. 'sparseon'
    //       is assumed to be true and this is a literal register-count threshold
    //       to use in the promotion hierarchy. This is done to ensure that the
    //       parameters of the test are very explicitly defined.
    private static final int SPARSE_THRESHOLD = 850;

    // ------------------------------------------------------------------------
    // computed constants
    private static final int REGISTER_COUNT = (1 << LOG2M);
    private static final int REGISTER_MAX_VALUE = (1 << REGWIDTH) - 1;
|
||||
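  // With LOG2M = 11 and REGWIDTH = 5 these evaluate to 2048 registers with a
  // maximum register value of 31.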

  // ========================================================================
  // Tests
  /**
   * Cumulatively adds random values to a FULL HLL through the small range
   * correction, uncorrected range, and large range correction of the HLL's
   * cardinality estimator.
   *
   * Format: cumulative add
   * Tests:
   * - FULL cardinality computation
   */
  private static void fullCardinalityCorrectionTest(final ISchemaVersion schemaVersion) throws IOException {
    final FileWriter output = openOutput(schemaVersion, "cardinality_correction", TestType.ADD);

    // the accumulator, starts empty
    final HLL hll = newHLL(HLLType.FULL);
    initLineAdd(output, hll, schemaVersion);

    // run through some values in the small range correction
    for(int i=0; i<((1 << LOG2M) - 1); i++) {
      final long rawValue = constructHLLValue(LOG2M, i, 1);
      cumulativeAddLine(output, hll, rawValue, schemaVersion);
    }

    // run up past some values in the uncorrected range
    for(int i=0; i<(1 << LOG2M); i++) {
      final long rawValue = constructHLLValue(LOG2M, i, 7);
      cumulativeAddLine(output, hll, rawValue, schemaVersion);
    }

    // run through some values in the large range correction
    for(int i=0; i<(1 << LOG2M); i++) {
      final long rawValue = constructHLLValue(LOG2M, i, 30);
      cumulativeAddLine(output, hll, rawValue, schemaVersion);
    }

    output.flush();
    output.close();
  }

  /**
   * Cumulatively adds random values to an EMPTY HLL.
   *
   * Format: cumulative add
   * Tests:
   * - EMPTY, EXPLICIT, SPARSE, PROBABILISTIC addition
   * - EMPTY to EXPLICIT promotion
   * - EXPLICIT to SPARSE promotion
   * - SPARSE to FULL promotion
   */
  private static void globalStepTest(final ISchemaVersion schemaVersion) throws IOException {
    final FileWriter output = openOutput(schemaVersion, "comprehensive_promotion", TestType.ADD);

    // the accumulator, starts empty
    final HLL hll = newHLL(HLLType.EMPTY);
    initLineAdd(output, hll, schemaVersion);

    for(int i=0; i<10000/*arbitrary*/; i++) {
      cumulativeAddLine(output, hll, randomLong(), schemaVersion);
    }

    output.flush();
    output.close();
  }

  /**
   * Cumulatively unions "underpopulated" FULL HLLs into the
   * accumulator to verify the correct behavior from the PostgreSQL implementation.
   * The PostgreSQL implementation's representations of probabilistic HLLs should
   * depend exclusively on the chosen SPARSE-to-FULL cutoff.
   *
   * Format: cumulative union
   * Tests:
   * - EMPTY U "underpopulated" FULL => SPARSE
   * - SPARSE U "underpopulated" FULL => SPARSE
   * - SPARSE U "barely underpopulated" FULL => FULL
   */
  private static void sparseFullRepresentationTest(final ISchemaVersion schemaVersion) throws IOException {
    final FileWriter output = openOutput(schemaVersion, "sparse_full_representation", TestType.UNION);

    final HLL emptyHLL1 = newHLL(HLLType.EMPTY);
    final HLL emptyHLL2 = newHLL(HLLType.EMPTY);

    cumulativeUnionLine(output, emptyHLL1, emptyHLL2, schemaVersion);

    // NOTE: In this test the sparseReference will be the "expected" value
    //       from the C representation, since it doesn't choose representation
    //       based on original encoding, but rather on the promotion rules
    //       and the declared type of the "receiving" field.
    //       It is the manually-constructed union result.

    // "underpopulated" FULL U EMPTY => SPARSE
    final HLL fullHLL = newHLL(HLLType.FULL);
    fullHLL.addRaw(constructHLLValue(LOG2M, 0/*ix*/, 1/*val*/));

    final HLL sparseHLL = newHLL(HLLType.SPARSE);
    sparseHLL.addRaw(constructHLLValue(LOG2M, 0/*ix*/, 1/*val*/));

    output.write(stringCardinality(fullHLL) + "," + toByteA(fullHLL, schemaVersion) + "," + stringCardinality(sparseHLL) + "," + toByteA(sparseHLL, schemaVersion) + "\n");
    output.flush();

    // "underpopulated" FULL (small) U SPARSE (small) => SPARSE
    final HLL fullHLL2 = newHLL(HLLType.FULL);
    fullHLL2.addRaw(constructHLLValue(LOG2M, 1/*ix*/, 1/*val*/));

    sparseHLL.addRaw(constructHLLValue(LOG2M, 1/*ix*/, 1/*val*/));

    output.write(stringCardinality(fullHLL2) + "," + toByteA(fullHLL2, schemaVersion) + "," + stringCardinality(sparseHLL) + "," + toByteA(sparseHLL, schemaVersion) + "\n");
    output.flush();

    // "underpopulated" FULL (just on edge) U SPARSE (small) => FULL
    final HLL fullHLL3 = newHLL(HLLType.FULL);
    for(int i=2; i<(SPARSE_THRESHOLD + 1); i++) {
      fullHLL3.addRaw(constructHLLValue(LOG2M, i/*ix*/, 1/*val*/));
      sparseHLL.addRaw(constructHLLValue(LOG2M, i/*ix*/, 1/*val*/));
    }

    output.write(stringCardinality(fullHLL3) + "," + toByteA(fullHLL3, schemaVersion) + "," + stringCardinality(sparseHLL) + "," + toByteA(sparseHLL, schemaVersion) + "\n");
    output.flush();
    output.close();
  }

  /**
   * Cumulatively sets successive registers to:
   *
   *     <code>(registerIndex % REGISTER_MAX_VALUE) + 1</code>
   *
   * by adding specifically constructed values to a SPARSE HLL.
   * Does not induce promotion.
   *
   * Format: cumulative add
   * Tests:
   * - SPARSE addition (predictable)
   */
  private static void sparseStepTest(final ISchemaVersion schemaVersion) throws IOException {
    final FileWriter output = openOutput(schemaVersion, "sparse_step", TestType.ADD);

    // the accumulator, starts empty sparse probabilistic
    final HLL hll = newHLL(HLLType.SPARSE);
    initLineAdd(output, hll, schemaVersion);

    for(int i=0; i<SPARSE_THRESHOLD; i++) {
      final long rawValue = constructHLLValue(LOG2M, i, ((i % REGISTER_MAX_VALUE) + 1));
      cumulativeAddLine(output, hll, rawValue, schemaVersion);
    }

    output.flush();
    output.close();
  }

  /**
   * Cumulatively sets random registers of a SPARSE HLL to
   * random values by adding random values. Does not induce promotion.
   *
   * Format: cumulative add
   * Tests:
   * - SPARSE addition (random)
   */
  private static void sparseRandomTest(final ISchemaVersion schemaVersion) throws IOException {
    final FileWriter output = openOutput(schemaVersion, "sparse_random", TestType.ADD);

    final Random random = new Random(randomLong());

    // the accumulator, starts empty
    final HLL hll = newHLL(HLLType.SPARSE);
    initLineAdd(output, hll, schemaVersion);

    for(int i=0; i<SPARSE_THRESHOLD; i++) {
      final int registerIndex = Math.abs(random.nextInt()) % REGISTER_COUNT;
      final int registerValue = ((Math.abs(random.nextInt()) % REGISTER_MAX_VALUE) + 1);
      final long rawValue = constructHLLValue(LOG2M, registerIndex, registerValue);

      cumulativeAddLine(output, hll, rawValue, schemaVersion);
    }

    output.flush();
    output.close();
  }

  /**
   * Cumulatively sets the first register (index 0) to value 2, the last
   * register (index m-1) to value 2, and then sets registers with indices in
   * the range 2 to (sparseCutoff + 2) to value 1 to trigger promotion.
   *
   * This tests for register alignment in the promotion from SPARSE
   * to FULL.
   *
   * Format: cumulative add
   * Tests:
   * - SPARSE addition
   * - SPARSE to FULL promotion
   */
  private static void sparseEdgeTest(final ISchemaVersion schemaVersion) throws IOException {
    final FileWriter output = openOutput(schemaVersion, "sparse_edge", TestType.ADD);

    // the accumulator, starts empty
    final HLL hll = newHLL(HLLType.SPARSE);
    initLineAdd(output, hll, schemaVersion);

    final long firstValue = constructHLLValue(LOG2M, 0, 2);
    cumulativeAddLine(output, hll, firstValue, schemaVersion);

    final long lastValue = constructHLLValue(LOG2M, (1 << LOG2M) - 1, 2);
    cumulativeAddLine(output, hll, lastValue, schemaVersion);

    for(int i=2; i<(SPARSE_THRESHOLD + 2); i++) {
      final long middleValue = constructHLLValue(LOG2M, i, 1);

      cumulativeAddLine(output, hll, middleValue, schemaVersion);
    }

    output.flush();
    output.close();
  }

  /**
   * Unions an EMPTY accumulator with EXPLICIT HLLs, each containing a
   * single random value.
   *
   * Format: cumulative union
   * Tests:
   * - EMPTY U EXPLICIT
   * - EXPLICIT U EXPLICIT
   * - EXPLICIT to SPARSE promotion
   * - SPARSE U EXPLICIT
   */
  private static void explicitPromotionTest(final ISchemaVersion schemaVersion) throws IOException {
    final FileWriter output = openOutput(schemaVersion, "explicit_promotion", TestType.UNION);

    final Random random = new Random(randomLong());

    // the accumulator, starts empty
    final HLL hll = newHLL(HLLType.EMPTY);
    final HLL emptyHLL = newHLL(HLLType.EMPTY);
    cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);

    for(int i=0; i<(EXPLICIT_THRESHOLD+500)/*should be greater than promotion cutoff*/; i++) {
      // make an EXPLICIT set and populate with cardinality 1
      final HLL explicitHLL = newHLL(HLLType.EXPLICIT);
      explicitHLL.addRaw(random.nextLong());

      cumulativeUnionLine(output, hll, explicitHLL, schemaVersion);
    }

    output.flush();
    output.close();
  }

  /**
   * Unions an EMPTY accumulator with SPARSE HLLs, each
   * having one register set.
   *
   * Format: cumulative union
   * Tests:
   * - EMPTY U SPARSE
   * - SPARSE U SPARSE
   * - SPARSE to FULL promotion
   * - SPARSE U FULL
   */
  private static void sparseProbabilisticPromotionTest(final ISchemaVersion schemaVersion) throws IOException {
    final FileWriter output = openOutput(schemaVersion, "sparse_promotion", TestType.UNION);

    final Random random = new Random(randomLong());

    // the accumulator, starts empty
    final HLL hll = newHLL(HLLType.EMPTY);
    final HLL emptyHLL = newHLL(HLLType.EMPTY);
    cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);

    for(int i=0; i<(SPARSE_THRESHOLD + 1000)/*should be greater than promotion cutoff*/; i++) {
      // make a SPARSE set and populate with cardinality 1
      final HLL sparseHLL = newHLL(HLLType.SPARSE);

      final int registerIndex = Math.abs(random.nextInt()) % REGISTER_COUNT;
      final int registerValue = ((Math.abs(random.nextInt()) % REGISTER_MAX_VALUE) + 1);
      final long rawValue = constructHLLValue(LOG2M, registerIndex, registerValue);
      sparseHLL.addRaw(rawValue);

      cumulativeUnionLine(output, hll, sparseHLL, schemaVersion);
    }

    output.flush();
    output.close();
  }

  /**
   * Unions an EMPTY accumulator with EXPLICIT HLLs, each having a single
   * random value, twice in a row to verify that the set properties are
   * satisfied.
   *
   * Format: cumulative union
   * Tests:
   * - EMPTY U EXPLICIT
   * - EXPLICIT U EXPLICIT
   */
  private static void explicitOverlapTest(final ISchemaVersion schemaVersion) throws IOException {
    final FileWriter output = openOutput(schemaVersion, "explicit_explicit", TestType.UNION);

    final Random random = new Random(randomLong());

    // the accumulator, starts empty
    final HLL hll = newHLL(HLLType.EMPTY);
    final HLL emptyHLL = newHLL(HLLType.EMPTY);

    cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);

    for(int i=0; i<EXPLICIT_THRESHOLD; i++) {
      // make an EXPLICIT set and populate with cardinality 1
      final HLL explicitHLL = newHLL(HLLType.EXPLICIT);
      explicitHLL.addRaw(random.nextLong());

      // union it into the accumulator twice, to test overlap (cardinality should not change)
      cumulativeUnionLine(output, hll, explicitHLL, schemaVersion);
      cumulativeUnionLine(output, hll, explicitHLL, schemaVersion);
    }

    output.flush();
    output.close();
  }

  /**
   * Unions an EMPTY accumulator with SPARSE HLLs, each
   * having a single register set, twice in a row to verify that the set
   * properties are satisfied.
   *
   * Format: cumulative union
   * Tests:
   * - EMPTY U SPARSE
   * - SPARSE U SPARSE
   */
  private static void sparseProbabilisticOverlapTest(final ISchemaVersion schemaVersion) throws IOException {
    final FileWriter output = openOutput(schemaVersion, "sparse_sparse", TestType.UNION);

    final Random random = new Random(randomLong());

    // the accumulator, starts empty
    final HLL hll = newHLL(HLLType.EMPTY);
    final HLL emptyHLL = newHLL(HLLType.EMPTY);

    cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);

    for(int i=0; i<SPARSE_THRESHOLD; i++) {
      // make a SPARSE set and populate with cardinality 1
      final HLL sparseHLL = newHLL(HLLType.SPARSE);
      final int registerIndex = Math.abs(random.nextInt()) % REGISTER_COUNT;
      final int registerValue = ((Math.abs(random.nextInt()) % REGISTER_MAX_VALUE) + 1);
      final long rawValue = constructHLLValue(LOG2M, registerIndex, registerValue);
      sparseHLL.addRaw(rawValue);

      cumulativeUnionLine(output, hll, sparseHLL, schemaVersion);
    }

    output.flush();
    output.close();
  }

  /**
   * Unions an EMPTY accumulator with FULL HLLs, each having
   * many registers set, twice in a row to verify that the set properties are
   * satisfied.
   *
   * Format: cumulative union
   * Tests:
   * - EMPTY U FULL
   * - FULL U FULL
   */
  private static void probabilisticUnionTest(final ISchemaVersion schemaVersion) throws IOException {
    final FileWriter output = openOutput(schemaVersion, "probabilistic_probabilistic", TestType.UNION);

    final Random random = new Random(randomLong());

    // the accumulator, starts empty
    final HLL hll = newHLL(HLLType.EMPTY);
    final HLL emptyHLL = newHLL(HLLType.EMPTY);
    cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);

    for(int i=0; i<1000/*number of rows to generate*/; i++) {
      // make a FULL set and populate with random values
      final HLL fullHLL = newHLL(HLLType.FULL);
      final int elementCount = random.nextInt(10000/*arbitrary maximum cardinality*/);
      for(int j=0;j<elementCount;j++) {
        fullHLL.addRaw(random.nextLong());
      }

      cumulativeUnionLine(output, hll, fullHLL, schemaVersion);
    }

    output.flush();
    output.close();
  }

  /**
   * Unions an EMPTY accumulator with random HLLs.
   *
   * Format: cumulative union
   * Tests:
   * - hopefully all union possibilities
   */
  private static void globalUnionTest(final ISchemaVersion schemaVersion) throws IOException {
    final FileWriter output = openOutput(schemaVersion, "comprehensive", TestType.UNION);

    // the accumulator, starts empty
    final HLL hll = newHLL(HLLType.EMPTY);
    final HLL emptyHLL = newHLL(HLLType.EMPTY);

    cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);

    for(int i=0; i<1000/*number of rows to generate*/; i++) {
      final HLL randomHLL = generateRandomHLL();
      cumulativeUnionLine(output, hll, randomHLL, schemaVersion);
    }

    output.flush();
    output.close();
  }

  // ========================================================================
  // Main
  public static void fullSuite(final ISchemaVersion schemaVersion) throws IOException {
    fullCardinalityCorrectionTest(schemaVersion);
    globalUnionTest(schemaVersion);
    globalStepTest(schemaVersion);
    probabilisticUnionTest(schemaVersion);
    explicitPromotionTest(schemaVersion);
    explicitOverlapTest(schemaVersion);
    sparseFullRepresentationTest(schemaVersion);
    sparseStepTest(schemaVersion);
    sparseRandomTest(schemaVersion);
    sparseEdgeTest(schemaVersion);
    sparseProbabilisticPromotionTest(schemaVersion);
    sparseProbabilisticOverlapTest(schemaVersion);
  }
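
  // Running main() below regenerates every fixture in OUTPUT_DIRECTORY for the
  // given schema version, producing files named like
  // "v1_cumulative_add_sparse_step.csv" (assuming VERSION_ONE's
  // schemaVersionNumber() is 1; see openOutput below for the naming scheme).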
  public static void main(String[] args) throws IOException {
    fullSuite(SerializationUtil.VERSION_ONE);
  }

  // ************************************************************************
  // Helpers
  /**
   * Shortcut for the testing constructor, which uses the constants defined at
   * the top of the file as default parameters.
   *
   * @return a new {@link HLL} of the specified type, which uses the parameters
   *         ({@link #LOG2M}, {@link #REGWIDTH}, {@link #EXPLICIT_THRESHOLD},
   *         and {@link #SPARSE_THRESHOLD}) specified above.
   */
  private static HLL newHLL(final HLLType type) {
    return new HLL(LOG2M, REGWIDTH, EXPLICIT_THRESHOLD, SPARSE_THRESHOLD, type);
  }

  /**
   * Returns the algorithm-specific cardinality of the specified {@link HLL}
   * as a {@link String} appropriate for comparison with the algorithm-specific
   * cardinality provided by the PostgreSQL implementation.
   *
   * @param hll the HLL whose algorithm-specific cardinality is to be printed.
   *        This cannot be <code>null</code>.
   * @return the algorithm-specific cardinality of the instance as a PostgreSQL-
   *         compatible String. This will never be <code>null</code>.
   */
  private static String stringCardinality(final HLL hll) {
    switch(hll.getType()) {
      case EMPTY:
        return "0";
      case EXPLICIT:/*promotion has not yet occurred*/
        return Long.toString(hll.cardinality());
      case SPARSE:
        return Double.toString(hll.sparseProbabilisticAlgorithmCardinality());
      case FULL:
        return Double.toString(hll.fullProbabilisticAlgorithmCardinality());
      default:
        throw new RuntimeException("Unknown HLL type " + hll.getType());
    }
  }

  /**
   * Generates a random HLL and populates it with random values.
   *
   * @return the populated HLL. This will never be <code>null</code>.
   */
  public static HLL generateRandomHLL() {
    final int randomTypeInt = randomIntBetween(0, HLLType.values().length - 1);
    final HLLType type;
    switch(randomTypeInt) {
      case 0:
        type = HLLType.EMPTY;
        break;
      case 1:
        type = HLLType.EXPLICIT;
        break;
      case 2:
        type = HLLType.FULL;
        break;
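      // NOTE: this slot also maps to EMPTY, presumably deliberately.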
      case 3:
        type = HLLType.EMPTY;
        break;
      case 4:
        type = HLLType.SPARSE;
        break;
      default:
        throw new RuntimeException("Unassigned type int " + randomTypeInt);
    }

    final int cardinalityCap;
    final int cardinalityBaseline;
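
    // The baseline pins the accumulator into the chosen type's cardinality
    // range; the random surplus (up to the cap) varies the population per run.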
    switch(type) {
      case EMPTY:
        return newHLL(HLLType.EMPTY);
      case EXPLICIT:
        cardinalityCap = EXPLICIT_THRESHOLD;
        cardinalityBaseline = 1;
        break;
      case SPARSE:
        cardinalityCap = SPARSE_THRESHOLD;
        cardinalityBaseline = (EXPLICIT_THRESHOLD + 1);
        break;
      case FULL:
        cardinalityCap = 100000;
        cardinalityBaseline = (SPARSE_THRESHOLD*10);
        break;
      default:
        throw new RuntimeException("We should never be here.");
    }

    final HLL hll = newHLL(HLLType.EMPTY);
    for(int i=0; i<cardinalityBaseline; i++) {
      hll.addRaw(randomLong());
    }
    for(int i=0; i<randomInt(cardinalityCap - cardinalityBaseline); i++) {
      hll.addRaw(randomLong());
    }

    return hll;
  }

  /**
   * Opens a {@link FileWriter} and writes out an appropriate CSV header.
   *
   * @param schemaVersion Schema version of the output. This cannot be
   *        <code>null</code>.
   * @param description Description string used to build the filename.
   *        This cannot be <code>null</code>.
   * @param type {@link TestType type} of the test file to be written.
   *        This cannot be <code>null</code>.
   * @return The opened {@link FileWriter writer}. This will never be <code>null</code>.
   */
  private static FileWriter openOutput(final ISchemaVersion schemaVersion, final String description, final TestType type) throws IOException {
    final String schemaVersionPrefix = "v" + schemaVersion.schemaVersionNumber() + "_";
    final String header;
    final String filename;
    switch(type) {
      case ADD:
        header = "cardinality,raw_value,HLL\n";
        filename = schemaVersionPrefix + "cumulative_add_" + description + ".csv";
        break;
      case UNION:
        header = "cardinality,HLL,union_cardinality,union_HLL\n";
        filename = schemaVersionPrefix + "cumulative_union_" + description + ".csv";
        break;
      default:
        throw new RuntimeException("Unknown test type " + type);
    }

    final FileWriter output = new FileWriter(OUTPUT_DIRECTORY + filename);
    output.write(header);
    output.flush();
    return output;
  }

  /**
   * Writes out a {@link TestType#ADD}-formatted test line.
   *
   * @param output The output {@link FileWriter writer}. This cannot be <code>null</code>.
   * @param hll The "accumulator" HLL instance. This cannot be <code>null</code>.
   * @param rawValue The raw value added to the HLL.
   * @param schemaVersion the schema with which to serialize the HLLs. This cannot
   *        be <code>null</code>.
   */
  private static void cumulativeAddLine(final FileWriter output, final HLL hll, final long rawValue, final ISchemaVersion schemaVersion) throws IOException {
    hll.addRaw(rawValue);
    final String accumulatorCardinality = stringCardinality(hll);

    output.write(accumulatorCardinality + "," + rawValue + "," + toByteA(hll, schemaVersion) + "\n");
    output.flush();
  }

  /**
   * Writes an initial line for a {@link TestType#ADD}-formatted test.
   *
   * @param output The output {@link FileWriter writer}. This cannot be <code>null</code>.
   * @param hll The "accumulator" HLL instance. This cannot be <code>null</code>.
   * @param schemaVersion the schema with which to serialize the HLLs. This cannot
   *        be <code>null</code>.
   */
  private static void initLineAdd(final FileWriter output, final HLL hll, final ISchemaVersion schemaVersion) throws IOException {
    output.write(0 + "," + 0 + "," + toByteA(hll, schemaVersion) + "\n");
    output.flush();
  }

  /**
   * Writes out a {@link TestType#UNION}-formatted test line.
   *
   * @param output The output {@link FileWriter writer}. This cannot be <code>null</code>.
   * @param hll The "accumulator" HLL instance. This cannot be <code>null</code>.
   * @param increment The "increment" HLL instance which will be unioned into
   *        the accumulator. This cannot be <code>null</code>.
   * @param schemaVersion the schema with which to serialize the HLLs. This cannot
   *        be <code>null</code>.
   */
  private static void cumulativeUnionLine(final FileWriter output, final HLL hll, final HLL increment, final ISchemaVersion schemaVersion) throws IOException {
    hll.union(increment);

    final String incrementCardinality = stringCardinality(increment);
    final String accumulatorCardinality = stringCardinality(hll);
    output.write(incrementCardinality + "," + toByteA(increment, schemaVersion) + "," + accumulatorCardinality + "," + toByteA(hll, schemaVersion) + "\n");
    output.flush();
  }

  /**
   * Serializes a HLL to Postgres 9 'bytea' hex-format, for CSV ingest.
   *
   * @param hll the HLL to serialize. This cannot be <code>null</code>.
   * @param schemaVersion the schema with which to serialize the HLLs. This cannot
   *        be <code>null</code>.
   * @return a PostgreSQL 'bytea' string representing the HLL.
   */
  private static String toByteA(final HLL hll, final ISchemaVersion schemaVersion) {
    final byte[] bytes = hll.toBytes(schemaVersion);
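    // Illustrative example (assuming NumberUtil.toHex renders each byte as two
    // hex digits): the bytes {0x12, 0x8B} would serialize as "\x128B".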
    return ("\\x" + NumberUtil.toHex(bytes, 0, bytes.length));
  }

  /**
   * Indicates what kind of test output a test will generate.
   */
  private static enum TestType {
    /**
     * This type of test is characterized by values being added to an
     * accumulator HLL whose serialized representation (after the value is added)
     * is printed to each line along with the cardinality and added value.
     */
    ADD,
    /**
     * This type of test is characterized by HLLs being unioned into an
     * accumulator HLL whose serialized representation (after the HLL is
     * union'd) is printed to each line along with the cardinalities and the
     * serialized representation of the HLL union'd in.
     */
    UNION;
  }
}

@@ -0,0 +1,76 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.util.hll;

/**
 * A collection of test utilities for constructing input values to HLLs and for
 * computing their serialized size.
 */
public class ProbabilisticTestUtil {
  /**
   * Constructs a value that when added raw to a HLL will set the register at
   * <code>registerIndex</code> to <code>registerValue</code>.
   *
   * @param log2m the log-base-2 of the number of registers in the HLL
   * @param registerIndex the index of the register to set
   * @param registerValue the value to set the register to
   * @return the value
   */
  public static long constructHLLValue(final int log2m, final int registerIndex, final int registerValue) {
    final long partition = registerIndex;
    final long substreamValue = (1L << (registerValue - 1));
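    // Worked example (illustrative): constructHLLValue(11, 3, 2) gives
    // substreamValue = 1L << 1 = 0b10, so the result is (0b10 << 11) | 3 --
    // the low 11 bits select register 3 and the position of the least
    // significant set bit above them encodes register value 2.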
    return (substreamValue << log2m) | partition;
  }

  /**
   * Extracts the HLL register index from a raw value.
   */
  public static short getRegisterIndex(final long rawValue, final int log2m) {
    final long mBitsMask = (1 << log2m) - 1;
    final short j = (short)(rawValue & mBitsMask);
    return j;
  }

  /**
   * Extracts the HLL register value from a raw value.
   */
  public static byte getRegisterValue(final long rawValue, final int log2m) {
    final long substreamValue = (rawValue >>> log2m);
    final byte p_w;

    if (substreamValue == 0L) {
      // The paper does not cover p(0x0), so the special value 0 is used.
      // 0 is the original initialization value of the registers, so by
      // doing this the HLL simply ignores it. This is acceptable
      // because the probability is 1/(2^(2^registerSizeInBits)).
      p_w = 0;
    } else {
      p_w = (byte)Math.min(1 + BitUtil.leastSignificantBit(substreamValue), 31);
    }

    return p_w;
  }

  /**
   * @return the number of bytes required to pack <code>registerCount</code>
   *         registers of width <code>shortWordLength</code>.
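   *         For example, 850 registers of width 16 bits pack into
   *         ceil((850 * 16) / 8) = 1700 bytes.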
   */
  public static int getRequiredBytes(final int shortWordLength, final int registerCount) {
    return (int)Math.ceil((registerCount * shortWordLength)/(float)8);
  }
}

@@ -0,0 +1,453 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.util.hll;

import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;

import com.carrotsearch.hppc.IntByteOpenHashMap;
import com.carrotsearch.hppc.cursors.IntByteCursor;
import com.carrotsearch.randomizedtesting.RandomizedTest;

/**
 * Tests {@link HLL} of type {@link HLLType#SPARSE}.
 */
public class SparseHLLTest extends LuceneTestCase {
  private static final int log2m = 11;

  /**
   * Tests {@link HLL#addRaw(long)}.
   */
  @Test
  public void addTest() {
    { // insert an element with register value 1 (minimum set value)
      final int registerIndex = 0;
      final int registerValue = 1;
      final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);

      final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
      hll.addRaw(rawValue);

      assertOneRegisterSet(hll, registerIndex, (byte)registerValue);
    }
    { // insert an element with register value 31 (maximum set value)
      final int registerIndex = 0;
      final int registerValue = 31;
      final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);

      final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
      hll.addRaw(rawValue);

      assertOneRegisterSet(hll, registerIndex, (byte)registerValue);
    }
    { // insert an element that could overflow the register (past 31)
      final int registerIndex = 0;
      final int registerValue = 36;
      final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);

      final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
      hll.addRaw(rawValue);

      assertOneRegisterSet(hll, (short)registerIndex, (byte)31/*register max*/);
    }
    { // insert duplicate elements, observe no change
      final int registerIndex = 0;
      final int registerValue = 1;
      final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);

      final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
      hll.addRaw(rawValue);
      hll.addRaw(rawValue);

      assertOneRegisterSet(hll, registerIndex, (byte)registerValue);
    }
    { // insert elements that increase a register's value
      final int registerIndex = 0;
      final int registerValue = 1;
      final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);

      final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
      hll.addRaw(rawValue);

      final int registerValue2 = 2;
      final long rawValue2 = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue2);
      hll.addRaw(rawValue2);

      assertOneRegisterSet(hll, registerIndex, (byte)registerValue2);
    }
    { // insert elements that have lower register values, observe no change
      final int registerIndex = 0;
      final int registerValue = 2;
      final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);

      final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
      hll.addRaw(rawValue);

      final int registerValue2 = 1;
      final long rawValue2 = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue2);
      hll.addRaw(rawValue2);

      assertOneRegisterSet(hll, registerIndex, (byte)registerValue);
    }
  }

  /**
   * Smoke test for {@link HLL#cardinality()} and the proper use of the small
   * range correction.
   */
  @Test
  public void smallRangeSmokeTest() {
    final int log2m = 11;
    final int m = (1 << log2m);
    final int regwidth = 5;

    // only one register set
    {
      final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
      hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 0, 1));

      final long cardinality = hll.cardinality();

      // Trivially true that small correction conditions hold: one register
      // set implies zeroes exist, and estimator trivially smaller than 5m/2.
      // Small range correction: m * log(m/V)
      final long expected = (long)Math.ceil(m * Math.log((double)m / (m - 1)/*# of zeroes*/));
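      // With m = 2048 and 2047 zero registers this evaluates to
      // ceil(2048 * ln(2048/2047)) = 2.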
      assertEquals(cardinality, expected);
    }

    // all but one register set
    {
      final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
      for(int i=0; i<(m - 1); i++) {
        hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, 1));
      }

      // Trivially true that small correction conditions hold: all but
      // one register set implies a zero exists, and estimator trivially
      // smaller than 5m/2 since it's alpha / ((m-1)/2)
      final long cardinality = hll.cardinality();

      // Small range correction: m * log(m/V)
      final long expected = (long)Math.ceil(m * Math.log((double)m / 1/*# of zeroes*/));
      assertEquals(cardinality, expected);
    }
  }

  /**
   * Smoke test for {@link HLL#cardinality()} and the proper use of the
   * uncorrected estimator.
   */
  @Test
  public void normalRangeSmokeTest() {
    final int log2m = 11;
    final int m = (1 << log2m);
    final int regwidth = 5;
    // regwidth = 5, so hash space is
    // log2m + (2^5 - 1 - 1), so L = log2m + 30
    final int l = log2m + 30;

    // all registers at 'medium' value
    {
      final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, m/*sparseThreshold*/, HLLType.SPARSE);

      final int registerValue = 7/*chosen to ensure neither correction kicks in*/;
      for(int i=0; i<m; i++) {
        hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, registerValue));
      }

      final long cardinality = hll.cardinality();

      // Simplified estimator when all registers take same value: alpha / (m/2^val)
      final double estimator = HLLUtil.alphaMSquared(m)/((double)m/Math.pow(2, registerValue));
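      // Roughly alpha * m * 2^7 (about 1.9e5 for m = 2048, taking alpha ~ 0.72):
      // above 5m/2 = 5120 and far below the large-range cutoff 2^41/30 ~ 7.3e10.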

      // Assert conditions for uncorrected range
      assertTrue(estimator <= Math.pow(2, l)/30);
      assertTrue(estimator > (5 * m /(double)2));

      final long expected = (long)Math.ceil(estimator);
      assertEquals(cardinality, expected);
    }
  }

  /**
   * Smoke test for {@link HLL#cardinality()} and the proper use of the large
   * range correction.
   */
  @Test
  public void largeRangeSmokeTest() {
    final int log2m = 11;
    final int m = (1 << log2m);
    final int regwidth = 5;
    // regwidth = 5, so hash space is
    // log2m + (2^5 - 1 - 1), so L = log2m + 30
    final int l = log2m + 30;

    // all registers at large value
    {
      final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, m/*sparseThreshold*/, HLLType.SPARSE);

      final int registerValue = 31/*chosen to ensure large correction kicks in*/;
      for(int i=0; i<m; i++) {
        hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, registerValue));
      }

      final long cardinality = hll.cardinality();

      // Simplified estimator when all registers take same value: alpha / (m/2^val)
      final double estimator = HLLUtil.alphaMSquared(m)/((double)m/Math.pow(2, registerValue));

      // Assert conditions for large range
      assertTrue(estimator > Math.pow(2, l)/30);

      // Large range correction: -2^L * log(1 - E/2^L), with L = log2m + 30 here
      final long expected = (long)Math.ceil(-1.0 * Math.pow(2, l) * Math.log(1.0 - estimator/Math.pow(2, l)));
      assertEquals(cardinality, expected);
    }
  }

  /**
   * Tests {@link HLL#union(HLL)}.
   */
  @Test
  public void unionTest() {
    final int log2m = 11/*arbitrary*/;
    final int sparseThreshold = 256/*arbitrary*/;

    { // two empty multisets should union to an empty set
      final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
      final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);

      hllA.union(hllB);

      assertEquals(hllA.getType(), HLLType.SPARSE/*unchanged*/);
      assertEquals(hllA.cardinality(), 0L);
    }
    { // two disjoint multisets should union properly
      final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
      hllA.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 1, 1));
      final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
      hllB.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 2, 1));

      hllA.union(hllB);

      assertEquals(hllA.getType(), HLLType.SPARSE/*unchanged*/);
      assertEquals(hllA.cardinality(), 3L/*precomputed*/);
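      // (small-range correction with two registers set:
      //  ceil(2048 * ln(2048/2046)) = 3)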
      assertRegisterPresent(hllA, 1, (byte)1);
      assertRegisterPresent(hllA, 2, (byte)1);
    }
    { // two exactly overlapping multisets should union properly
      final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
      hllA.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 1, 10));
      final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
      hllB.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 1, 13));

      hllA.union(hllB);

      assertEquals(hllA.getType(), HLLType.SPARSE/*unchanged*/);
      assertEquals(hllA.cardinality(), 2L/*precomputed*/);
      assertOneRegisterSet(hllA, 1, (byte)13/*max(10,13)*/);
    }
    { // overlapping multisets should union properly
      final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
      final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
      // register index = 3
      final long rawValueA = ProbabilisticTestUtil.constructHLLValue(log2m, 3, 11);

      // register index = 4
      final long rawValueB = ProbabilisticTestUtil.constructHLLValue(log2m, 4, 13);
      final long rawValueBPrime = ProbabilisticTestUtil.constructHLLValue(log2m, 4, 21);

      // register index = 5
      final long rawValueC = ProbabilisticTestUtil.constructHLLValue(log2m, 5, 14);

      hllA.addRaw(rawValueA);
      hllA.addRaw(rawValueB);

      hllB.addRaw(rawValueBPrime);
      hllB.addRaw(rawValueC);

      hllA.union(hllB);
      // union should have three registers set, with partition B set to the
      // max of the two registers
      assertRegisterPresent(hllA, 3, (byte)11);
      assertRegisterPresent(hllA, 4, (byte)21/*max(21,13)*/);
      assertRegisterPresent(hllA, 5, (byte)14);
    }
    { // too-large unions should promote
      final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
      final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);

      // fill up sets to maxCapacity
      for(int i=0; i<sparseThreshold; i++) {
        hllA.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, 1));
        hllB.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, (i + sparseThreshold)/*non-overlapping*/, 1));
      }

      hllA.union(hllB);

      assertEquals(hllA.getType(), HLLType.FULL);
    }
  }

  /**
   * Tests {@link HLL#clear()}.
   */
  @Test
  public void clearTest() {
    final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.SPARSE);
    hll.addRaw(1L);
    hll.clear();
    assertEquals(hll.cardinality(), 0L);
  }

  /**
   * Tests {@link HLL#toBytes(ISchemaVersion)} and
   * {@link HLL#fromBytes(byte[])}.
   */
  @Test
  public void toFromBytesTest() {
    final int log2m = 11/*arbitrary*/;
    final int regwidth = 5/*arbitrary*/;
    final int sparseThreshold = 256/*arbitrary*/;
    final int shortWordLength = 16/*log2m + regwidth = 11 + 5*/;

    final ISchemaVersion schemaVersion = SerializationUtil.DEFAULT_SCHEMA_VERSION;
    final HLLType type = HLLType.SPARSE;
    final int padding = schemaVersion.paddingBytes(type);

    { // Should work on an empty element
      final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
      final byte[] bytes = hll.toBytes(schemaVersion);

      // output should just be padding since no registers are used
      assertEquals(bytes.length, padding);

      final HLL inHLL = HLL.fromBytes(bytes);

      // assert register values correct
      assertElementsEqual(hll, inHLL);
    }
    { // Should work on a partially filled element
      final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);

      for(int i=0; i<3; i++) {
        final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, i, (i+9));
        hll.addRaw(rawValue);
      }

      final byte[] bytes = hll.toBytes(schemaVersion);

      assertEquals(bytes.length, padding + ProbabilisticTestUtil.getRequiredBytes(shortWordLength, 3/*registerCount*/));

      final HLL inHLL = HLL.fromBytes(bytes);

      // assert register values correct
      assertElementsEqual(hll, inHLL);
    }
    { // Should work on a full set
      final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);

      for(int i=0; i<sparseThreshold; i++) {
        final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, i, (i % 9) + 1);
        hll.addRaw(rawValue);
      }

      final byte[] bytes = hll.toBytes(schemaVersion);

      // 'short words' are log2m + regwidth = 11 + 5 = 16 bits long
      assertEquals(bytes.length, padding + ProbabilisticTestUtil.getRequiredBytes(shortWordLength, sparseThreshold));

      final HLL inHLL = HLL.fromBytes(bytes);

      // assert register values correct
      assertElementsEqual(hll, inHLL);
    }
  }

  /**
   * Smoke tests the multisets by adding random values.
   */
  @Test
  public void randomValuesTest() {
    final int log2m = 11/*arbitrary*/;
    final int regwidth = 5/*arbitrary*/;
    final int sparseThreshold = 256/*arbitrary*/;

    for(int run=0; run<100; run++) {
      final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);

      final IntByteOpenHashMap map = new IntByteOpenHashMap();

      for(int i=0; i<sparseThreshold; i++) {
        final long rawValue = RandomizedTest.randomLong();

        final short registerIndex = ProbabilisticTestUtil.getRegisterIndex(rawValue, log2m);
        final byte registerValue = ProbabilisticTestUtil.getRegisterValue(rawValue, log2m);
        if(map.get(registerIndex) < registerValue) {
          map.put(registerIndex, registerValue);
        }

        hll.addRaw(rawValue);
      }

      for (IntByteCursor c : map) {
        final byte expectedRegisterValue = map.get(c.key);
        assertRegisterPresent(hll, c.key, expectedRegisterValue);
      }
    }
  }

  //*************************************************************************
  // assertion helpers
  /**
   * Asserts that the register at the specified index is set to the specified
   * value.
   */
  private static void assertRegisterPresent(final HLL hll,
                                            final int registerIndex,
                                            final int registerValue) {
    final IntByteOpenHashMap sparseProbabilisticStorage = hll.sparseProbabilisticStorage;
    assertEquals(sparseProbabilisticStorage.get(registerIndex), registerValue);
  }

  /**
   * Asserts that only the specified register is set and has the specified value.
   */
  private static void assertOneRegisterSet(final HLL hll,
                                           final int registerIndex,
                                           final byte registerValue) {
    final IntByteOpenHashMap sparseProbabilisticStorage = hll.sparseProbabilisticStorage;
    assertEquals(sparseProbabilisticStorage.size(), 1);
    assertEquals(sparseProbabilisticStorage.get(registerIndex), registerValue);
  }

  /**
   * Asserts that all registers in the two {@link HLL} instances are identical.
   */
  private static void assertElementsEqual(final HLL hllA, final HLL hllB) {
    final IntByteOpenHashMap sparseProbabilisticStorageA = hllA.sparseProbabilisticStorage;
    final IntByteOpenHashMap sparseProbabilisticStorageB = hllB.sparseProbabilisticStorage;
    assertEquals(sparseProbabilisticStorageA.size(), sparseProbabilisticStorageB.size());
    for (IntByteCursor c : sparseProbabilisticStorageA) {
      assertEquals(sparseProbabilisticStorageA.get(c.key),
                   sparseProbabilisticStorageB.get(c.key));
    }
  }
}