Collections-824: Optimize SimpleHasher.forEachIndex and SimpleHasher name change (#320)

* Renamed SimpleHasher to EnhancedDoubleHasher

* Added test for number of bits < number of hash functions

* Added IncrementingHasher for testing and updated tests

* Added test for number of bits < number of hash functions

* Fixed uniqueIndices implementation

Added default implementation.
Added test verifying that the unique filter works.
Claude Warren 2022-08-08 08:44:37 +01:00 committed by GitHub
parent a43e0245ba
commit df091173cd
24 changed files with 497 additions and 381 deletions

@ -0,0 +1,227 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.collections4.bloomfilter;
import java.util.Objects;
import java.util.function.IntPredicate;
/**
* A Hasher that implements combinatorial hashing as described by
* <a href="https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf">Kirsch and Mitzenmacher</a> using the enhanced double hashing technique
* described in the Wikipedia article <a href="https://en.wikipedia.org/wiki/Double_hashing#Enhanced_double_hashing">Double Hashing</a>.
* <p>
* A common use for this hasher is to generate bit indices from the byte array output of a hashing
* or MessageDigest algorithm.</p>
*
* <h2>Thoughts on the hasher input</h2>
*
*<p>Note that small values for the {@code initial} and {@code increment} are undesirable. If the {@code initial} is smaller than
* the number of bits in a filter then hashing will start at the same point when the size increases; likewise the {@code increment} will be
* the same if it remains smaller than the number of bits in the filter, so the first few indices will be the same if the number of bits
* changes (but is still larger than the {@code increment}). In a worst-case scenario, with small {@code initial} and {@code increment} for
* all items, hashing may not create indices that fill the full region within a much larger filter. Imagine hashers created with {@code initial}
* and {@code increment} values less than 255, a filter size of 30000 and 5 hash functions. Ignoring the
* tetrahedral addition (a maximum of 20 for k=5), the maximum index is 255 * 4 + 255 = 1275, which covers only 4.25% of the filter. This also
* ignores the negative wrapping, but the behaviour is the same: some bits cannot be reached.
* </p><p>
* This situation should be avoided, as the filter's probability assumptions will no longer hold. If the {@code initial} and {@code increment} are larger
* than the number of bits then the modulus will create a 'random' position and increment within the size.
* </p>
*
* @since 4.5
*/
public class EnhancedDoubleHasher implements Hasher {
/**
* The initial hash value.
*/
private final long initial;
/**
* The value to increment the hash value by.
*/
private final long increment;
/**
* Converts bytes to a big-endian long, filling with zero bytes as necessary.
* @param byteArray the byte array to extract the values from.
* @param offset the offset to start extraction from.
* @param len the length of the extraction, may be longer than 8.
* @return the big-endian long representation of the specified bytes.
*/
private static long toLong(byte[] byteArray, int offset, int len) {
long val = 0;
int shift = Long.SIZE;
final int end = offset + Math.min(len, Long.BYTES);
for (int i = offset; i < end; i++) {
shift -= Byte.SIZE;
val |= ((long) (byteArray[i] & 0xFF) << shift);
}
return val;
}
/**
* Constructs the EnhancedDoubleHasher from a byte array.
* <p>
* This method simplifies the conversion from a Digest or hasher algorithm output
* to the two values used by the EnhancedDoubleHasher.</p>
* <p>The byte array is split in 2 and the first 8 bytes of each half are interpreted as a big-endian long value.
* Excess bytes are ignored.
* If there are fewer than 16 bytes the following conversions are made.
*</p>
* <ol>
* <li>If there is an odd number of bytes the excess byte is assigned to the increment value</li>
* <li>The bytes allotted are read in big-endian order; any byte not populated is set to zero.</li>
* </ol>
* <p>
* This ensures that small arrays generate the largest possible increment and initial values.
* </p>
* @param buffer the buffer to extract the longs from.
* @throws IllegalArgumentException if the buffer length is zero.
*/
public EnhancedDoubleHasher(byte[] buffer) {
if (buffer.length == 0) {
throw new IllegalArgumentException("buffer length must be greater than 0");
}
// divide by 2
int segment = buffer.length / 2;
this.initial = toLong(buffer, 0, segment);
this.increment = toLong(buffer, segment, buffer.length - segment);
}
/**
* Constructs the EnhancedDoubleHasher from 2 longs. The long values will be interpreted as unsigned values.
* @param initial The initial value for the hasher.
* @param increment The value to increment the hash by on each iteration.
*/
public EnhancedDoubleHasher(long initial, long increment) {
this.initial = initial;
this.increment = increment;
}
/**
* Gets the initial value for the hash calculation.
* @return the initial value for the hash calculation.
*/
long getInitial() {
return initial;
}
/**
* Gets the increment value for the hash calculation.
* @return the increment value for the hash calculation.
*/
long getIncrement() {
return increment;
}
/**
* Performs a modulus calculation on an unsigned long and an integer divisor.
* @param dividend an unsigned long value to calculate the modulus of.
* @param divisor the divisor for the modulus calculation.
* @return the remainder or modulus value.
*/
static int mod(long dividend, int divisor) {
// See Hacker's Delight (2nd ed), section 9.3.
// Assume divisor is positive.
// Divide half the unsigned number and then double the quotient result.
final long quotient = ((dividend >>> 1) / divisor) << 1;
final long remainder = dividend - quotient * divisor;
// remainder in [0, 2 * divisor)
return (int) (remainder >= divisor ? remainder - divisor : remainder);
}
@Override
public IndexProducer indices(final Shape shape) {
Objects.requireNonNull(shape, "shape");
return new IndexProducer() {
@Override
public boolean forEachIndex(IntPredicate consumer) {
Objects.requireNonNull(consumer, "consumer");
final int bits = shape.getNumberOfBits();
// Enhanced double hashing:
// hash[i] = ( h1(x) + i*h2(x) + (i*i*i - i)/6 ) mod bits
// See: https://en.wikipedia.org/wiki/Double_hashing#Enhanced_double_hashing
//
// Essentially this is computing a wrapped modulus from a start point and an
// increment and an additional term as a tetrahedral number.
// You only need two modulus operations before the loop. Within the loop
// the modulus is handled using the sign bit to detect wrapping to ensure:
// 0 <= index < bits
// 0 <= inc < bits
// The final hash is:
// hash[i] = ( h1(x) - i*h2(x) - (i*i*i - i)/6 ) wrapped in [0, bits)
int index = mod(initial, bits);
int inc = mod(increment, bits);
final int k = shape.getNumberOfHashFunctions();
if (k > bits) {
for (int j = k; j > 0;) {
// handle k > bits
final int block = Math.min(j, bits);
j -= block;
for (int i = 0; i < block; i++) {
if (!consumer.test(index)) {
return false;
}
// Update index and handle wrapping
index -= inc;
index = index < 0 ? index + bits : index;
// Incorporate the counter into the increment to create a
// tetrahedral number additional term, and handle wrapping.
inc -= i;
inc = inc < 0 ? inc + bits : inc;
}
}
} else {
for (int i = 0; i < k; i++) {
if (!consumer.test(index)) {
return false;
}
// Update index and handle wrapping
index -= inc;
index = index < 0 ? index + bits : index;
// Incorporate the counter into the increment to create a
// tetrahedral number additional term, and handle wrapping.
inc -= i;
inc = inc < 0 ? inc + bits : inc;
}
}
return true;
}
@Override
public int[] asIndexArray() {
int[] result = new int[shape.getNumberOfHashFunctions()];
int[] idx = new int[1];
// This method needs to return duplicate indices
forEachIndex(i -> {
result[idx[0]++] = i;
return true;
});
return result;
}
};
}
}
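
For illustration, a minimal usage sketch (not part of the commit) showing how the new hasher is driven through the Hasher API above. The seed values and Shape parameters are arbitrary; the imports assume the org.apache.commons.collections4.bloomfilter package shown in this diff.

import org.apache.commons.collections4.bloomfilter.EnhancedDoubleHasher;
import org.apache.commons.collections4.bloomfilter.Hasher;
import org.apache.commons.collections4.bloomfilter.Shape;

public class EnhancedDoubleHasherSketch {
    public static void main(String[] args) {
        // initial and increment stand in for two independent hash values of a single item.
        Hasher hasher = new EnhancedDoubleHasher(0x5555555555555555L, 0x9E3779B97F4A7C15L);
        // k = 5 hash functions over m = 72 bits.
        Shape shape = Shape.fromKM(5, 72);
        // forEachIndex stops early if the predicate returns false; here it always continues.
        hasher.indices(shape).forEachIndex(i -> {
            System.out.println(i); // each index is in [0, 72)
            return true;
        });
    }
}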

@ -16,6 +16,8 @@
*/
package org.apache.commons.collections4.bloomfilter;
import java.util.Objects;
/**
* A Hasher creates IndexProducer based on the hash implementation and the
* provided Shape.
@ -52,5 +54,10 @@ public interface Hasher {
* @param shape the shape of the desired Bloom filter.
* @return the iterator of integers
*/
IndexProducer uniqueIndices(Shape shape);
default IndexProducer uniqueIndices(Shape shape) {
return consumer -> {
Objects.requireNonNull(consumer, "consumer");
return Hasher.this.indices(shape).forEachIndex(IndexFilter.create(shape, consumer));
};
}
}
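
As a quick sketch of what the new default buys (not part of the commit), the hypothetical test below contrasts indices() with uniqueIndices() using the duplicate-producing setup relied on by the counting-filter test further down: Shape.fromKM(12, 72) with the test-only IncrementingHasher(5, 12), which emits 5, 17, 29, 41, 53 and 65 twice.

import static org.junit.jupiter.api.Assertions.assertEquals;

import java.util.HashSet;
import java.util.Set;

import org.junit.jupiter.api.Test;

// Sketch only; IncrementingHasher is the package-private test helper added by this commit,
// so this would live in the org.apache.commons.collections4.bloomfilter test package.
public class UniqueIndicesSketchTest {
    @Test
    public void uniqueIndicesRemovesDuplicates() {
        Shape shape = Shape.fromKM(12, 72);
        Hasher hasher = new IncrementingHasher(5, 12);                 // 5, 17, 29, 41, 53, 65 ... twice over
        assertEquals(12, hasher.indices(shape).asIndexArray().length); // duplicates included
        Set<Integer> unique = new HashSet<>();
        hasher.uniqueIndices(shape).forEachIndex(unique::add);         // default impl filters via IndexFilter
        assertEquals(6, unique.size());
    }
}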

@ -186,20 +186,6 @@ public final class Shape implements Comparable<Shape> {
return -(m / k) * Math.log1p(-c / m);
}
/**
* The factory to assist in the creation of proper Shapes.
*
* In the methods of this factory the `from` names are appended with the standard variable
* names in the order expected:
*
* <dl>
* <dt>{@code N})</dt><dd>The number of items to be placed in the Bloom filter</dd>
* <dt>{@code M})</dt><dd>The number of bits in the Bloom filter</dd>
* <dt>{@code K})</dt><dd>The number of hash functions for each item placed in the Bloom filter</dd>
* <dt>{@code P})</dt><dd>The probability of a collision once N items have been placed in the Bloom filter</dd>
* </dl>
*/
/**
* Constructs a filter configuration with a desired false-positive probability ({@code p}) and the
* specified number of bits ({@code m}) and hash functions ({@code k}).

@ -1,196 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.collections4.bloomfilter;
import java.util.Objects;
import java.util.function.IntPredicate;
/**
* A Hasher that implements combinatorial hashing as described by
* <a href="https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf">Kirsch and Mitzenmacher</a>.
* <p>
* Common use for this hasher is to generate a byte array as the output of a hashing
* or MessageDigest algorithm.</p>
*
* @since 4.5
*/
public class SimpleHasher implements Hasher {
/**
* The initial hash value.
*/
private final long initial;
/**
* The value to increment the hash value by.
*/
private final long increment;
/**
* Convert bytes to long.
* @param byteArray the byte array to extract the values from.
* @param offset the offset to start extraction from.
* @param len the length of the extraction, may be longer than 8.
* @return
*/
private static long toLong(byte[] byteArray, int offset, int len) {
long val = 0;
len = Math.min(len, Long.BYTES);
for (int i = 0; i < len; i++) {
val <<= 8;
val |= (byteArray[offset + i] & 0x00FF);
}
return val;
}
/**
* Constructs the SimpleHasher from a byte array.
* <p>The byte array is split in 2 and each half is interpreted as a long value.
* Excess bytes are ignored. This simplifies the conversion from a Digest or hasher algorithm output
* to the two values used by the SimpleHasher.</p>
* <p><em>If the second long is zero the default increment is used instead.</em></p>
* @param buffer the buffer to extract the longs from.
* @throws IllegalArgumentException if the buffer length is zero.
* @see #getDefaultIncrement()
*/
public SimpleHasher(byte[] buffer) {
if (buffer.length == 0) {
throw new IllegalArgumentException("buffer length must be greater than 0");
}
int segment = buffer.length / 2;
this.initial = toLong(buffer, 0, segment);
long possibleIncrement = toLong(buffer, segment, buffer.length - segment);
this.increment = possibleIncrement == 0 ? getDefaultIncrement() : possibleIncrement;
}
/**
* Constructs the SimpleHasher from 2 longs. The long values will be interpreted as unsigned values.
* <p><em>If the increment is zero the default increment is used instead.</em></p>
* @param initial The initial value for the hasher.
* @param increment The value to increment the hash by on each iteration.
* @see #getDefaultIncrement()
*/
public SimpleHasher(long initial, long increment) {
this.initial = initial;
this.increment = increment == 0 ? getDefaultIncrement() : increment;
}
/**
* Get the default increment used when the requested increment is zero.
* <p>
* By default this is the same
* default increment used in Java's SplittableRandom random number generator. It is the
* fractional representation of the golden ratio (0.618...) with a base of 2^64.
* </p><p>
* Implementations may want to override this value to match defaults in legacy implementations.
* </p>
* @return The default increment to use when the requested increment is zero.
*/
public long getDefaultIncrement() {
return 0x9e3779b97f4a7c15L;
}
/**
* Performs a modulus calculation on an unsigned long and an integer divisor.
* @param dividend an unsigned long value to calculate the modulus of.
* @param divisor the divisor for the modulus calculation.
* @return the remainder or modulus value.
*/
static int mod(long dividend, int divisor) {
// See Hacker's Delight (2nd ed), section 9.3.
// Assume divisor is positive.
// Divide half the unsigned number and then double the quotient result.
final long quotient = ((dividend >>> 1) / divisor) << 1;
final long remainder = dividend - quotient * divisor;
// remainder in [0, 2 * divisor)
return (int) (remainder >= divisor ? remainder - divisor : remainder);
}
@Override
public IndexProducer indices(final Shape shape) {
Objects.requireNonNull(shape, "shape");
return new IndexProducer() {
@Override
public boolean forEachIndex(IntPredicate consumer) {
Objects.requireNonNull(consumer, "consumer");
int bits = shape.getNumberOfBits();
/*
* Essentially this is computing a wrapped modulus from a start point and an
* increment. So actually you only need two modulus operations before the loop.
* This avoids any modulus operation inside the while loop. It uses a long index
* to avoid overflow.
*/
long index = mod(initial, bits);
int inc = mod(increment, bits);
for (int functionalCount = 0; functionalCount < shape.getNumberOfHashFunctions(); functionalCount++) {
if (!consumer.test((int) index)) {
return false;
}
index += inc;
index = index >= bits ? index - bits : index;
}
return true;
}
@Override
public int[] asIndexArray() {
int[] result = new int[shape.getNumberOfHashFunctions()];
int[] idx = new int[1];
/*
* This method needs to return duplicate indices
*/
forEachIndex(i -> {
result[idx[0]++] = i;
return true;
});
return result;
}
};
}
@Override
public IndexProducer uniqueIndices(final Shape shape) {
return new IndexProducer() {
@Override
public boolean forEachIndex(IntPredicate consumer) {
Objects.requireNonNull(consumer, "consumer");
IntPredicate filter = IndexFilter.create(shape, consumer);
int bits = shape.getNumberOfBits();
// Set up for the modulus. Use a long index to avoid overflow.
long index = mod(initial, bits);
int inc = mod(increment, bits);
for (int functionalCount = 0; functionalCount < shape.getNumberOfHashFunctions(); functionalCount++) {
if (!filter.test((int) index)) {
return false;
}
index += inc;
index = index >= bits ? index - bits : index;
}
return true;
}
};
}
}
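
For comparison, a standalone sketch (not from the commit) of the two index formulas: the plain double hashing of the removed SimpleHasher above and the textbook enhanced form referenced in the EnhancedDoubleHasher comments. It uses non-negative seeds and Math.floorMod for clarity, so it sidesteps the unsigned mod() handling; the committed code applies the same terms by counting downward with explicit wrap handling rather than taking a modulus per index.

final class DoubleHashingSketch {
    // SimpleHasher (removed): hash[i] = (h1 + i * h2) mod m
    static int simpleIndex(long h1, long h2, int i, int m) {
        return (int) Math.floorMod(h1 + i * h2, (long) m);
    }

    // Enhanced double hashing: hash[i] = (h1 + i * h2 + (i^3 - i) / 6) mod m
    static int enhancedIndex(long h1, long h2, int i, int m) {
        long tetrahedral = ((long) i * i * i - i) / 6; // always a whole number for integer i
        return (int) Math.floorMod(h1 + i * h2 + tetrahedral, (long) m);
    }

    public static void main(String[] args) {
        for (int i = 0; i < 5; i++) {
            System.out.printf("i=%d simple=%d enhanced=%d%n",
                    i, simpleIndex(1, 1, i, 72), enhancedIndex(1, 1, i, 72));
        }
    }
}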

@ -30,15 +30,15 @@ import org.junit.jupiter.api.Test;
*/
public abstract class AbstractBloomFilterTest<T extends BloomFilter> {
protected final SimpleHasher from1 = new SimpleHasher(1, 1);
protected final Hasher from1 = new IncrementingHasher(1, 1);
protected final long from1Value = 0x3fffeL;
protected final SimpleHasher from11 = new SimpleHasher(11, 1);
protected final Hasher from11 = new IncrementingHasher(11, 1);
protected final long from11Value = 0xffff800L;
protected final HasherCollection bigHasher = new HasherCollection(from1, from11);
protected final long bigHashValue = 0xffffffeL;
protected final HasherCollection fullHasher = new HasherCollection(new SimpleHasher(0, 1)/* 0-16 */,
new SimpleHasher(17, 1)/* 17-33 */, new SimpleHasher(33, 1)/* 33-49 */, new SimpleHasher(50, 1)/* 50-66 */,
new SimpleHasher(67, 1)/* 67-83 */
protected final HasherCollection fullHasher = new HasherCollection(new IncrementingHasher(0, 1)/* 0-16 */,
new IncrementingHasher(17, 1)/* 17-33 */, new IncrementingHasher(33, 1)/* 33-49 */, new IncrementingHasher(50, 1)/* 50-66 */,
new IncrementingHasher(67, 1)/* 67-83 */
);
protected final long[] fullHashValue = { 0xffffffffffffffffL, 0xfffffL };
@ -150,18 +150,18 @@ public abstract class AbstractBloomFilterTest<T extends BloomFilter> {
assertFalse(bf1.contains(bf2), "BF should not contain BF2");
assertTrue(bf2.contains(bf1), "BF2 should contain BF");
assertTrue(bf2.contains(new SimpleHasher(1, 1)), "BF2 Should contain this hasher");
assertFalse(bf2.contains(new SimpleHasher(1, 3)), "BF2 Should not contain this hasher");
assertTrue(bf2.contains(new IncrementingHasher(1, 1)), "BF2 Should contain this hasher");
assertFalse(bf2.contains(new IncrementingHasher(1, 3)), "BF2 Should not contain this hasher");
IndexProducer indexProducer = new SimpleHasher(1, 1).indices(getTestShape());
IndexProducer indexProducer = new IncrementingHasher(1, 1).indices(getTestShape());
assertTrue(bf2.contains(indexProducer), "BF2 Should contain this hasher");
indexProducer = new SimpleHasher(1, 3).indices(getTestShape());
indexProducer = new IncrementingHasher(1, 3).indices(getTestShape());
assertFalse(bf2.contains(indexProducer), "BF2 Should not contain this hasher");
BitMapProducer bitMapProducer = BitMapProducer.fromIndexProducer(new SimpleHasher(1, 1).indices(getTestShape()),
BitMapProducer bitMapProducer = BitMapProducer.fromIndexProducer(new IncrementingHasher(1, 1).indices(getTestShape()),
getTestShape().getNumberOfBits());
assertTrue(bf2.contains(bitMapProducer), "BF2 Should contain this hasher");
bitMapProducer = BitMapProducer.fromIndexProducer(new SimpleHasher(1, 3).indices(getTestShape()),
bitMapProducer = BitMapProducer.fromIndexProducer(new IncrementingHasher(1, 3).indices(getTestShape()),
getTestShape().getNumberOfBits());
assertFalse(bf2.contains(bitMapProducer), "BF2 Should not contain this hasher");
@ -228,11 +228,11 @@ public abstract class AbstractBloomFilterTest<T extends BloomFilter> {
// the data provided above do not generate an estimate that is equivalent to the
// actual.
filter1.merge(new SimpleHasher(4, 1));
filter1.merge(new IncrementingHasher(4, 1));
assertEquals(1, filter1.estimateN());
filter1.merge(new SimpleHasher(17, 1));
filter1.merge(new IncrementingHasher(17, 1));
assertEquals(3, filter1.estimateN());
}
@ -244,7 +244,7 @@ public abstract class AbstractBloomFilterTest<T extends BloomFilter> {
public final void testAsBitMapArray() {
// test when multiple long values are returned.
final SimpleHasher hasher = new SimpleHasher(63, 1);
final IncrementingHasher hasher = new IncrementingHasher(63, 1);
final BloomFilter bf = createFilter(Shape.fromKM(2, 72), hasher);
final long[] lb = bf.asBitMapArray();
assertEquals(2, lb.length);
@ -265,7 +265,7 @@ public abstract class AbstractBloomFilterTest<T extends BloomFilter> {
filter = createFilter(getTestShape(), fullHasher);
assertTrue(filter.isFull(), "Should be full");
filter = createFilter(getTestShape(), new SimpleHasher(1, 3));
filter = createFilter(getTestShape(), new IncrementingHasher(1, 3));
assertFalse(filter.isFull(), "Should not be full");
}
@ -313,12 +313,12 @@ public abstract class AbstractBloomFilterTest<T extends BloomFilter> {
// test error when bloom filter returns values out of range
final BloomFilter bf5 = new SimpleBloomFilter(
Shape.fromKM(getTestShape().getNumberOfHashFunctions(), 3 * Long.SIZE),
new SimpleHasher(Long.SIZE * 2, 1));
new IncrementingHasher(Long.SIZE * 2, 1));
assertThrows(IllegalArgumentException.class, () -> bf1.merge(bf5));
final BloomFilter bf6 = new SparseBloomFilter(
Shape.fromKM(getTestShape().getNumberOfHashFunctions(), 3 * Long.SIZE),
new SimpleHasher(Long.SIZE * 2, 1));
new IncrementingHasher(Long.SIZE * 2, 1));
assertThrows(IllegalArgumentException.class, () -> bf1.merge(bf6));
}

@ -234,7 +234,7 @@ public abstract class AbstractCountingBloomFilterTest<T extends CountingBloomFil
// create a hasher that produces duplicates with the specified shape.
// this setup produces 5, 17, 29, 41, 53, 65 two times
Shape shape = Shape.fromKM(12, 72);
SimpleHasher hasher = new SimpleHasher(5, 12);
Hasher hasher = new IncrementingHasher(5, 12);
CountingBloomFilter bf1 = createFilter(shape, hasher);
assertEquals(6, bf1.cardinality());

@ -19,10 +19,13 @@ package org.apache.commons.collections4.bloomfilter;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.junit.Test;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.CsvSource;
@ -63,7 +66,7 @@ public abstract class AbstractHasherTest extends AbstractIndexProducerTest {
}
@ParameterizedTest
@CsvSource({ "17, 72", "3, 14", "5, 67868", })
@CsvSource({ "17, 72", "3, 14", "5, 67868", "75, 10"})
public void testHashing(int k, int m) {
int[] count = { 0 };
Hasher hasher = createHasher();
@ -74,16 +77,28 @@ public abstract class AbstractHasherTest extends AbstractIndexProducerTest {
});
assertEquals(k * getHasherSize(hasher), count[0],
() -> String.format("Did not produce k=%d * m=%d indices", k, getHasherSize(hasher)));
// test early exit
count[0] = 0;
hasher.indices(Shape.fromKM(k, m)).forEachIndex(i -> {
assertTrue(i >= 0 && i < m, () -> "Out of range: " + i + ", m=" + m);
count[0]++;
return false;
});
assertEquals(1, count[0], "did not exit early");
}
@Test
public void testUniqueIndex() {
// create a hasher that produces duplicates with the specified shape.
// this setup produces 5, 17, 29, 41, 53, 65 two times
Shape shape = Shape.fromKM(12, 72);
Hasher hasher = new SimpleHasher(5, 12);
Set<Integer> set = new HashSet<>();
assertTrue(hasher.uniqueIndices(shape).forEachIndex(set::add), "Duplicate detected");
assertEquals(6, set.size());
// generating 11 numbers in the range of [0,9] will yield at least one collision.
Shape shape = Shape.fromKM(11, 10);
Hasher hasher = createHasher();
IndexProducer producer = hasher.indices(shape);
List<Integer> full = Arrays.stream(producer.asIndexArray()).boxed().collect(Collectors.toList());
producer = hasher.uniqueIndices(shape);
List<Integer> unique = Arrays.stream(producer.asIndexArray()).boxed().collect(Collectors.toList());
assertTrue(full.size() > unique.size());
Set<Integer> set = new HashSet<>(unique);
assertEquals(set.size(), unique.size());
}
}

@ -23,7 +23,7 @@ public class BitCountProducerFromArrayCountingBloomFilterTest extends AbstractBi
@Override
protected BitCountProducer createProducer() {
ArrayCountingBloomFilter filter = new ArrayCountingBloomFilter(shape);
Hasher hasher = new SimpleHasher(0, 1);
Hasher hasher = new IncrementingHasher(0, 1);
filter.merge(hasher);
return filter;
}

@ -23,7 +23,7 @@ public class BitMapProducerFromArrayCountingBloomFilterTest extends AbstractBitM
@Override
protected BitMapProducer createProducer() {
ArrayCountingBloomFilter filter = new ArrayCountingBloomFilter(shape);
Hasher hasher = new SimpleHasher(0, 1);
Hasher hasher = new IncrementingHasher(0, 1);
filter.merge(hasher);
return filter;
}

@ -22,7 +22,7 @@ public class BitMapProducerFromSimpleBloomFilterTest extends AbstractBitMapProdu
@Override
protected BitMapProducer createProducer() {
Hasher hasher = new SimpleHasher(0, 1);
Hasher hasher = new IncrementingHasher(0, 1);
return new SimpleBloomFilter(shape, hasher);
}

@ -22,7 +22,7 @@ public class BitMapProducerFromSparseBloomFilterTest extends AbstractBitMapProdu
@Override
protected BitMapProducer createProducer() {
Hasher hasher = new SimpleHasher(0, 1);
Hasher hasher = new IncrementingHasher(0, 1);
return new SparseBloomFilter(shape, hasher);
}

@ -52,7 +52,7 @@ public class DefaultBloomFilterTest extends AbstractBloomFilterTest<DefaultBloom
@Test
public void testDefaultBloomFilterSimpleSpecificMerge() {
AbstractDefaultBloomFilter filter = new SparseDefaultBloomFilter(Shape.fromKM(3, 150));
Hasher hasher = new SimpleHasher(0, 1);
Hasher hasher = new IncrementingHasher(0, 1);
assertTrue(filter.merge(hasher));
assertEquals(3, filter.cardinality());
}
@ -62,7 +62,7 @@ public class DefaultBloomFilterTest extends AbstractBloomFilterTest<DefaultBloom
public void testDefaultBloomFilterSparseSpecificMerge() {
Shape shape = Shape.fromKM(3, 150);
AbstractDefaultBloomFilter filter = new SparseDefaultBloomFilter(shape);
AbstractDefaultBloomFilter filter2 = new SparseDefaultBloomFilter(shape, new SimpleHasher(0, 1));
AbstractDefaultBloomFilter filter2 = new SparseDefaultBloomFilter(shape, new IncrementingHasher(0, 1));
BloomFilter newFilter = filter.copy();
newFilter.merge(filter2);
assertEquals(3, newFilter.cardinality());
@ -70,7 +70,7 @@ public class DefaultBloomFilterTest extends AbstractBloomFilterTest<DefaultBloom
@Test
public void testHasherBasedMergeInPlaceWithDifferingSparseness() {
Hasher hasher = new SimpleHasher(1, 1);
Hasher hasher = new IncrementingHasher(1, 1);
BloomFilter bf1 = new NonSparseDefaultBloomFilter(getTestShape());
bf1.merge(hasher);

@ -0,0 +1,94 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.collections4.bloomfilter;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
import org.junit.jupiter.api.Test;
/**
* Tests the {@link EnhancedDoubleHasher}.
*/
public class EnhancedDoubleHasherTest extends AbstractHasherTest {
@Override
protected Hasher createHasher() {
return new EnhancedDoubleHasher(1, 1);
}
@Override
protected Hasher createEmptyHasher() {
return NullHasher.INSTANCE;
}
@Override
protected int getHasherSize(Hasher hasher) {
return 1;
}
@Test
public void testByteConstructor() {
// a single value becomes the increment.
EnhancedDoubleHasher hasher = new EnhancedDoubleHasher( new byte[] { 1 } );
assertEquals( 0, hasher.getInitial() );
assertEquals( 0x01_00_00_00_00_00_00_00L, hasher.getIncrement() );
// 2 bytes become initial and increment.
hasher = new EnhancedDoubleHasher( new byte[] { 1, 2 } );
assertEquals( 0x01_00_00_00_00_00_00_00L, hasher.getInitial() );
assertEquals( 0x200000000000000L, hasher.getIncrement() );
// an odd number of bytes places the extra byte in the increment.
hasher = new EnhancedDoubleHasher( new byte[] { 1, 2, 3 } );
assertEquals( 0x01_00_00_00_00_00_00_00L, hasher.getInitial() );
assertEquals( 0x203000000000000L, hasher.getIncrement() );
// an even number of bytes splits evenly between initial and increment.
hasher = new EnhancedDoubleHasher( new byte[] {0, 1, 0, 2 } );
assertEquals( 0x01_00_00_00_00_00_00L, hasher.getInitial() );
assertEquals( 0x02_00_00_00_00_00_00L, hasher.getIncrement() );
// longs are parsed correctly
hasher = new EnhancedDoubleHasher( new byte[] { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2 } );
assertEquals( 1, hasher.getInitial() );
assertEquals( 2, hasher.getIncrement() );
// excess bytes are ignored before mid point and at end
hasher = new EnhancedDoubleHasher( new byte[] { 0, 0, 0, 0, 0, 0, 0, 1, 5, 5, 0, 0, 0, 0, 0, 0, 0, 2, 5, 5 } );
assertEquals( 1, hasher.getInitial() );
assertEquals( 2, hasher.getIncrement() );
// odd extra bytes are accounted for correctly
hasher = new EnhancedDoubleHasher( new byte[] { 0, 0, 0, 0, 0, 0, 0, 1, 5, 1, 0, 0, 0, 0, 0, 0, 2, 5, 5 } );
assertEquals( 1, hasher.getInitial() );
assertEquals( 0x01_00_00_00_00_00_00_02L, hasher.getIncrement() );
// test empty buffer
assertThrows(IllegalArgumentException.class, () -> new EnhancedDoubleHasher(new byte[0]));
}
@Test
void testModEdgeCases() {
for (long dividend : new long[] { -1, -2, -3, -6378683, -23567468136887892L, Long.MIN_VALUE, 345, 678686,
67868768686878924L, Long.MAX_VALUE }) {
for (int divisor : new int[] { 1, 2, 3, 5, 13, Integer.MAX_VALUE }) {
assertEquals((int) Long.remainderUnsigned(dividend, divisor), EnhancedDoubleHasher.mod(dividend, divisor),
() -> String.format("failure with dividend=%s and divisor=%s.", dividend, divisor));
}
}
}
}
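
To make the byte-splitting rules concrete, a short worked trace (not part of the commit) of the three-byte case exercised in testByteConstructor above; getInitial() and getIncrement() are the package-private accessors the test itself uses.

// new EnhancedDoubleHasher(new byte[] { 1, 2, 3 })
// buffer.length = 3, so segment = 3 / 2 = 1
// initial   = toLong(buffer, 0, 1): byte 0x01 shifted to the top byte     -> 0x0100000000000000L
// increment = toLong(buffer, 1, 2): bytes 0x02, 0x03 in the top two bytes -> 0x0203000000000000L
EnhancedDoubleHasher hasher = new EnhancedDoubleHasher(new byte[] { 1, 2, 3 });
assertEquals(0x0100000000000000L, hasher.getInitial());
assertEquals(0x0203000000000000L, hasher.getIncrement());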

@ -33,7 +33,7 @@ public class HasherCollectionTest extends AbstractHasherTest {
@Override
protected HasherCollection createHasher() {
return new HasherCollection(new SimpleHasher(1, 1), new SimpleHasher(2, 2));
return new HasherCollection(new IncrementingHasher(1, 1), new IncrementingHasher(2, 2));
}
@Override
@ -54,7 +54,7 @@ public class HasherCollectionTest extends AbstractHasherTest {
@Test
public void testCollectionConstructor() {
List<Hasher> lst = Arrays.asList(new SimpleHasher(3, 2), new SimpleHasher(4, 2));
List<Hasher> lst = Arrays.asList(new IncrementingHasher(3, 2), new IncrementingHasher(4, 2));
HasherCollectionTest nestedTest = new HasherCollectionTest() {
@Override
protected HasherCollection createHasher() {
@ -71,7 +71,7 @@ public class HasherCollectionTest extends AbstractHasherTest {
nestedTest = new HasherCollectionTest() {
@Override
protected HasherCollection createHasher() {
return new HasherCollection(new SimpleHasher(3, 2), new SimpleHasher(4, 2));
return new HasherCollection(new IncrementingHasher(3, 2), new IncrementingHasher(4, 2));
}
@Override
@ -85,10 +85,10 @@ public class HasherCollectionTest extends AbstractHasherTest {
@Test
public void testAdd() {
HasherCollection hasher = createHasher();
hasher.add(new SimpleHasher(2, 2));
hasher.add(new IncrementingHasher(2, 2));
assertEquals(3, hasher.getHashers().size());
hasher.add(Arrays.asList(new SimpleHasher(3, 2), new SimpleHasher(4, 2)));
hasher.add(Arrays.asList(new IncrementingHasher(3, 2), new IncrementingHasher(4, 2)));
assertEquals(5, hasher.getHashers().size());
}
@ -97,7 +97,7 @@ public class HasherCollectionTest extends AbstractHasherTest {
// create a hasher that produces duplicates with the specified shape.
// this setup produces 5, 17, 29, 41, 53, 65 two times
Shape shape = Shape.fromKM(12, 72);
Hasher h1 = new SimpleHasher(5, 12);
Hasher h1 = new IncrementingHasher(5, 12);
HasherCollection hasher = createEmptyHasher();
hasher.add(h1);
hasher.add(h1);
@ -115,9 +115,9 @@ public class HasherCollectionTest extends AbstractHasherTest {
@Test
void testHasherCollection() {
Hasher h1 = new SimpleHasher(13, 4678);
Hasher h2 = new SimpleHasher(42, 987);
Hasher h3 = new SimpleHasher(454, 2342);
Hasher h1 = new IncrementingHasher(13, 4678);
Hasher h2 = new IncrementingHasher(42, 987);
Hasher h3 = new IncrementingHasher(454, 2342);
HasherCollection hc1 = new HasherCollection(Arrays.asList(h1, h1));
HasherCollection hc2 = new HasherCollection(Arrays.asList(h2, h3));

@ -0,0 +1,100 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.collections4.bloomfilter;
import java.util.Objects;
import java.util.function.IntPredicate;
/**
* A Hasher that implements simple combinatorial hashing as described by
* <a href="https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf">Kirsch and Mitzenmacher</a>.
*
* <p>To be used for testing only.</p>
*
* @since 4.5
*/
class IncrementingHasher implements Hasher {
/**
* The initial hash value.
*/
private final long initial;
/**
* The value to increment the hash value by.
*/
private final long increment;
/**
* Constructs the IncrementingHasher from 2 longs. The long values will be interpreted as unsigned values.
* <p>
* The initial hash value will be the modulus of the initial value.
* Subsequent values will be calculated by repeatedly adding the increment to the last value and taking the modulus.
* </p>
* @param initial The initial value for the hasher.
* @param increment The value to increment the hash by on each iteration.
*/
IncrementingHasher(long initial, long increment) {
this.initial = initial;
this.increment = increment;
}
@Override
public IndexProducer indices(final Shape shape) {
Objects.requireNonNull(shape, "shape");
return new IndexProducer() {
@Override
public boolean forEachIndex(IntPredicate consumer) {
Objects.requireNonNull(consumer, "consumer");
int bits = shape.getNumberOfBits();
// Essentially this is computing a wrapped modulus from a start point and an
// increment. So actually you only need two modulus operations before the loop.
// This avoids any modulus operation inside the loop. It uses a long index
// to avoid overflow.
long index = EnhancedDoubleHasher.mod(initial, bits);
int inc = EnhancedDoubleHasher.mod(increment, bits);
for (int functionalCount = 0; functionalCount < shape.getNumberOfHashFunctions(); functionalCount++) {
if (!consumer.test((int) index)) {
return false;
}
index += inc;
index = index >= bits ? index - bits : index;
}
return true;
}
@Override
public int[] asIndexArray() {
int[] result = new int[shape.getNumberOfHashFunctions()];
int[] idx = new int[1];
// This method needs to return duplicate indices
forEachIndex(i -> {
result[idx[0]++] = i;
return true;
});
return result;
}
};
}
}
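
A quick sketch (not part of the commit) of what this test-only hasher yields, matching the /* 0-16 */ style comments used in AbstractBloomFilterTest; IncrementingHasher is package-private, so this belongs in the same test package.

// IncrementingHasher(0, 1) with Shape.fromKM(17, 72) simply counts up from its start point.
Hasher hasher = new IncrementingHasher(0, 1);
int[] indices = hasher.indices(Shape.fromKM(17, 72)).asIndexArray();
// indices == { 0, 1, 2, ..., 16 }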

@ -23,7 +23,7 @@ public class IndexProducerFromArrayCountingBloomFilterTest extends AbstractIndex
@Override
protected IndexProducer createProducer() {
ArrayCountingBloomFilter filter = new ArrayCountingBloomFilter(shape);
Hasher hasher = new SimpleHasher(0, 1);
Hasher hasher = new IncrementingHasher(0, 1);
filter.merge(hasher);
return filter;
}

@ -20,7 +20,7 @@ public class IndexProducerFromHasherCollectionTest extends AbstractIndexProducer
@Override
protected IndexProducer createProducer() {
return new HasherCollection(new SimpleHasher(0, 1), new SimpleHasher(0, 2)).indices(Shape.fromKM(17, 72));
return new HasherCollection(new IncrementingHasher(0, 1), new IncrementingHasher(0, 2)).indices(Shape.fromKM(17, 72));
}
@Override

@ -20,7 +20,7 @@ public class IndexProducerFromHasherTest extends AbstractIndexProducerTest {
@Override
protected IndexProducer createProducer() {
return new SimpleHasher(0, 1).indices(Shape.fromKM(17, 72));
return new IncrementingHasher(0, 1).indices(Shape.fromKM(17, 72));
}
@Override

@ -22,7 +22,7 @@ public class IndexProducerFromSimpleBloomFilterTest extends AbstractIndexProduce
@Override
protected IndexProducer createProducer() {
Hasher hasher = new SimpleHasher(0, 1);
Hasher hasher = new IncrementingHasher(0, 1);
return new SparseBloomFilter(shape, hasher);
}

@ -22,7 +22,7 @@ public class IndexProducerFromSparseBloomFilterTest extends AbstractIndexProduce
@Override
protected IndexProducer createProducer() {
Hasher hasher = new SimpleHasher(0, 1);
Hasher hasher = new IncrementingHasher(0, 1);
return new SimpleBloomFilter(shape, hasher);
}

@ -25,9 +25,9 @@ import org.junit.jupiter.api.Test;
*/
public class SetOperationsTest {
protected final SimpleHasher from1 = new SimpleHasher(1, 1);
protected final Hasher from1 = new IncrementingHasher(1, 1);
protected final long from1Value = 0x3FFFEL;
protected final SimpleHasher from11 = new SimpleHasher(11, 1);
protected final Hasher from11 = new IncrementingHasher(11, 1);
protected final long from11Value = 0xFFFF800L;
protected final HasherCollection bigHasher = new HasherCollection(from1, from11);
protected final long bigHashValue = 0xFFFFFFEL;
@ -49,7 +49,7 @@ public class SetOperationsTest {
Shape shape2 = Shape.fromKM(2, 72);
filter1 = new SimpleBloomFilter(shape2, from1);
filter2 = new SimpleBloomFilter(shape2, new SimpleHasher(2, 1));
filter2 = new SimpleBloomFilter(shape2, new IncrementingHasher(2, 1));
int dotProduct = /* [1,2] & [2,3] = [2] = */ 1;
int cardinalityA = 2;

@ -1,117 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.collections4.bloomfilter;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
import java.util.ArrayList;
import java.util.List;
import org.junit.jupiter.api.Test;
/**
* Tests the {@link SimpleHasher}.
*/
public class SimpleHasherTest extends AbstractHasherTest {
@Override
protected Hasher createHasher() {
return new SimpleHasher(1, 1);
}
@Override
protected Hasher createEmptyHasher() {
return NullHasher.INSTANCE;
}
@Override
protected int getHasherSize(Hasher hasher) {
return 1;
}
private void assertConstructorBuffer(Shape shape, byte[] buffer, Integer[] expected) {
SimpleHasher hasher = new SimpleHasher(buffer);
List<Integer> lst = new ArrayList<>();
IndexProducer producer = hasher.indices(shape);
producer.forEachIndex(lst::add);
assertEquals(expected.length, lst.size());
for (int i = 0; i < expected.length; i++) {
assertEquals(expected[i], lst.get(i));
}
}
private void assertIncrement(SimpleHasher hasher, long defaultIncrement) {
assertEquals(defaultIncrement, hasher.getDefaultIncrement());
int[] values = hasher.indices(Shape.fromKM(2, Integer.MAX_VALUE)).asIndexArray();
assertEquals(0, values[0]);
assertEquals(Long.remainderUnsigned(defaultIncrement, Integer.MAX_VALUE), values[1]);
}
@Test
public void testConstructor() {
Shape shape = Shape.fromKM(5, 10);
assertConstructorBuffer(shape, new byte[] { 1, 1 }, new Integer[] { 1, 2, 3, 4, 5 });
assertConstructorBuffer(shape, new byte[] { 1 }, new Integer[] { 0, 1, 2, 3, 4 });
assertConstructorBuffer(shape, new byte[] { 1, 0, 1 }, new Integer[] { 1, 2, 3, 4, 5 });
assertConstructorBuffer(shape, new byte[] { 0, 1, 0, 1 }, new Integer[] { 1, 2, 3, 4, 5 });
assertConstructorBuffer(shape, new byte[] { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1 },
new Integer[] { 1, 2, 3, 4, 5 });
assertConstructorBuffer(shape, new byte[] { 0, 0, 0, 0, 0, 0, 0, 1, 5, 5, 0, 0, 0, 0, 0, 0, 0, 1, 5, 5 },
new Integer[] { 1, 2, 3, 4, 5 });
assertConstructorBuffer(shape, new byte[] { 0, 0, 0, 0, 0, 0, 0, 1, 5, 0, 0, 0, 0, 0, 0, 0, 1, 5, 5 },
new Integer[] { 1, 2, 3, 4, 5 });
// test empty buffer
assertThrows(IllegalArgumentException.class, () -> new SimpleHasher(new byte[0]));
// test zero incrementer gets default
// default increment from SimpleHasher.
long defaultIncrement = 0x9e3779b97f4a7c15L;
SimpleHasher hasher = new SimpleHasher(0, 0);
assertIncrement(new SimpleHasher(0, 0), defaultIncrement);
assertIncrement(new SimpleHasher(new byte[2]), defaultIncrement);
// test that changing default increment works
defaultIncrement = 4L;
hasher = new SimpleHasher(0, 0) {
@Override
public long getDefaultIncrement() {
return 4L;
}
};
assertIncrement(hasher, defaultIncrement);
hasher = new SimpleHasher(new byte[2]) {
@Override
public long getDefaultIncrement() {
return 4L;
}
};
assertEquals(defaultIncrement, hasher.getDefaultIncrement());
}
@Test
void testModEdgeCases() {
for (long dividend : new long[] { -1, -2, -3, -6378683, -23567468136887892L, Long.MIN_VALUE, 345, 678686,
67868768686878924L, Long.MAX_VALUE }) {
for (int divisor : new int[] { 1, 2, 3, 5, 13, Integer.MAX_VALUE }) {
assertEquals((int) Long.remainderUnsigned(dividend, divisor), SimpleHasher.mod(dividend, divisor),
() -> String.format("failure with dividend=%s and divisor=%s.", dividend, divisor));
}
}
}
}

@ -20,7 +20,7 @@ public class UniqueIndexProducerFromHasherCollectionTest extends AbstractIndexPr
@Override
protected IndexProducer createProducer() {
return new HasherCollection(new SimpleHasher(0, 1), new SimpleHasher(0, 2)).uniqueIndices(Shape.fromKM(17, 72));
return new HasherCollection(new IncrementingHasher(0, 1), new IncrementingHasher(0, 2)).uniqueIndices(Shape.fromKM(17, 72));
}
@Override

@ -20,7 +20,7 @@ public class UniqueIndexProducerFromHasherTest extends AbstractIndexProducerTest
@Override
protected IndexProducer createProducer() {
return new SimpleHasher(0, 1).uniqueIndices(Shape.fromKM(17, 72));
return new IncrementingHasher(0, 1).uniqueIndices(Shape.fromKM(17, 72));
}
@Override