diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/EnhancedDoubleHasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/EnhancedDoubleHasher.java new file mode 100644 index 000000000..347a951a3 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/EnhancedDoubleHasher.java @@ -0,0 +1,227 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.Objects; +import java.util.function.IntPredicate; + +/** + * A Hasher that implements combinatorial hashing as described by + * Kirsch and Mitzenmacher using the enhanced double hashing technique + * described in the Wikipedia article Double Hashing. + *

+ * Common use for this hasher is to generate bit indices from a byte array output of a hashing + * or MessageDigest algorithm.

+ * + *

Thoughts on the hasher input

+ * + *

Note that it is worse to create smaller numbers for the {@code initial} and {@code increment}. If the {@code initial} is smaller than + * the number of bits in a filter then hashing will start at the same point when the size increases; likewise the {@code increment} will be + * the same if it remains smaller than the number of bits in the filter and so the first few indices will be the same if the number of bits + * changes (but is still larger than the {@code increment}). In a worst case scenario with small {@code initial} and {@code increment} for + * all items, hashing may not create indices that fill the full region within a much larger filter. Imagine hashers created with {@code initial} + * and {@code increment} values less than 255 with a filter size of 30000 and number of hash functions as 5. Ignoring the + * tetrahedral addition (a maximum of 20 for k=5) the max index is 255 * 4 + 255 = 1275; this covers 4.25% of the filter. This also + * ignores the negative wrapping but the behaviour is the same: some bits cannot be reached. + *

+ * So this needs to be avoided as the filter probability assumptions will be void. If the {@code initial} and {@code increment} are larger + * than the number of bits then the modulus will create a 'random' position and increment within the size. + *

+ * + * @since 4.5 + */ +public class EnhancedDoubleHasher implements Hasher { + + /** + * The initial hash value. + */ + private final long initial; + + /** + * The value to increment the hash value by. + */ + private final long increment; + + /** + * Convert bytes to big-endian long filling with zero bytes as necessary.. + * @param byteArray the byte array to extract the values from. + * @param offset the offset to start extraction from. + * @param len the length of the extraction, may be longer than 8. + * @return + */ + private static long toLong(byte[] byteArray, int offset, int len) { + long val = 0; + int shift = Long.SIZE; + final int end = offset + Math.min(len, Long.BYTES); + for (int i = offset; i < end; i++) { + shift -= Byte.SIZE; + val |= ((long) (byteArray[i] & 0xFF) << shift); + } + return val; + } + + /** + * Constructs the EnhancedDoubleHasher from a byte array. + *

+ * This method simplifies the conversion from a Digest or hasher algorithm output + * to the two values used by the EnhancedDoubleHasher.

+ *

The byte array is split in 2 and the first 8 bytes of each half are interpreted as a big-endian long value. + * Excess bytes are ignored. + * If there are fewer than 16 bytes the following conversions are made. + *

+ *
    + *
  1. If there is an odd number of bytes the excess byte is assigned to the increment value
  2. + *
  3. The bytes allotted are read in big-endian order; any byte not populated is set to zero.
  4. + *
+ *

+ * This ensures that small arrays generate the largest possible increment and initial values. + *

+ * @param buffer the buffer to extract the longs from. + * @throws IllegalArgumentException is buffer length is zero. + */ + public EnhancedDoubleHasher(byte[] buffer) { + if (buffer.length == 0) { + throw new IllegalArgumentException("buffer length must be greater than 0"); + } + // divide by 2 + int segment = buffer.length / 2; + this.initial = toLong(buffer, 0, segment); + this.increment = toLong(buffer, segment, buffer.length - segment); + } + + /** + * Constructs the EnhancedDoubleHasher from 2 longs. The long values will be interpreted as unsigned values. + * @param initial The initial value for the hasher. + * @param increment The value to increment the hash by on each iteration. + */ + public EnhancedDoubleHasher(long initial, long increment) { + this.initial = initial; + this.increment = increment; + } + + /** + * Gets the initial value for the hash calculation. + * @return the initial value for the hash calculation. + */ + long getInitial() { + return initial; + } + + /** + * Gets the increment value for the hash calculation. + * @return the increment value for the hash calculation. + */ + long getIncrement() { + return increment; + } + + /** + * Performs a modulus calculation on an unsigned long and an integer divisor. + * @param dividend a unsigned long value to calculate the modulus of. + * @param divisor the divisor for the modulus calculation. + * @return the remainder or modulus value. + */ + static int mod(long dividend, int divisor) { + // See Hacker's Delight (2nd ed), section 9.3. + // Assume divisor is positive. + // Divide half the unsigned number and then double the quotient result. + final long quotient = ((dividend >>> 1) / divisor) << 1; + final long remainder = dividend - quotient * divisor; + // remainder in [0, 2 * divisor) + return (int) (remainder >= divisor ? 
remainder - divisor : remainder); + } + + @Override + public IndexProducer indices(final Shape shape) { + Objects.requireNonNull(shape, "shape"); + + return new IndexProducer() { + + @Override + public boolean forEachIndex(IntPredicate consumer) { + Objects.requireNonNull(consumer, "consumer"); + final int bits = shape.getNumberOfBits(); + // Enhanced double hashing: + // hash[i] = ( h1(x) + i*h2(x) + (i*i*i - i)/6 ) mod bits + // See: https://en.wikipedia.org/wiki/Double_hashing#Enhanced_double_hashing + // + // Essentially this is computing a wrapped modulus from a start point and an + // increment and an additional term as a tetrahedral number. + // You only need two modulus operations before the loop. Within the loop + // the modulus is handled using the sign bit to detect wrapping to ensure: + // 0 <= index < bits + // 0 <= inc < bits + // The final hash is: + // hash[i] = ( h1(x) - i*h2(x) - (i*i*i - i)/6 ) wrapped in [0, bits) + + int index = mod(initial, bits); + int inc = mod(increment, bits); + + final int k = shape.getNumberOfHashFunctions(); + if (k > bits) { + for (int j = k; j > 0;) { + // handle k > bits + final int block = Math.min(j, bits); + j -= block; + for (int i = 0; i < block; i++) { + if (!consumer.test(index)) { + return false; + } + // Update index and handle wrapping + index -= inc; + index = index < 0 ? index + bits : index; + + // Incorporate the counter into the increment to create a + // tetrahedral number additional term, and handle wrapping. + inc -= i; + inc = inc < 0 ? inc + bits : inc; + } + } + } else { + for (int i = 0; i < k; i++) { + if (!consumer.test(index)) { + return false; + } + // Update index and handle wrapping + index -= inc; + index = index < 0 ? index + bits : index; + + // Incorporate the counter into the increment to create a + // tetrahedral number additional term, and handle wrapping. + inc -= i; + inc = inc < 0 ? 
inc + bits : inc; + } + } + return true; + } + + @Override + public int[] asIndexArray() { + int[] result = new int[shape.getNumberOfHashFunctions()]; + int[] idx = new int[1]; + + // This method needs to return duplicate indices + + forEachIndex(i -> { + result[idx[0]++] = i; + return true; + }); + return result; + } + }; + } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/Hasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/Hasher.java index 6f2b5aab8..82445a623 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/Hasher.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/Hasher.java @@ -16,6 +16,8 @@ */ package org.apache.commons.collections4.bloomfilter; +import java.util.Objects; + /** * A Hasher creates IndexProducer based on the hash implementation and the * provided Shape. @@ -52,5 +54,10 @@ public interface Hasher { * @param shape the shape of the desired Bloom filter. * @return the iterator of integers */ - IndexProducer uniqueIndices(Shape shape); + default IndexProducer uniqueIndices(Shape shape) { + return consumer -> { + Objects.requireNonNull(consumer, "consumer"); + return Hasher.this.indices(shape).forEachIndex(IndexFilter.create(shape, consumer)); + }; + } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java b/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java index 40db56516..d39df255e 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java @@ -186,20 +186,6 @@ public final class Shape implements Comparable { return -(m / k) * Math.log1p(-c / m); } - /** - * The factory to assist in the creation of proper Shapes. - * - * In the methods of this factory the `from` names are appended with the standard variable - * names in the order expected: - * - *
- *
{@code N})
The number of items to be placed in the Bloom filter
- *
{@code M})
The number of bits in the Bloom filter
- *
{@code K})
The number of hash functions for each item placed in the Bloom filter
- *
{@code P})
The probability of a collision once N items have been placed in the Bloom filter
- *
- */ - /** * Constructs a filter configuration with a desired false-positive probability ({@code p}) and the * specified number of bits ({@code m}) and hash functions ({@code k}). diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleHasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleHasher.java deleted file mode 100644 index 6c5056dc4..000000000 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleHasher.java +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter; - -import java.util.Objects; -import java.util.function.IntPredicate; - -/** - * A Hasher that implements combinatorial hashing as as described by - * Krisch and Mitzenmacher. - *

- * Common use for this hasher is to generate a byte array as the output of a hashing - * or MessageDigest algorithm.

- * - * @since 4.5 - */ -public class SimpleHasher implements Hasher { - - /** - * The initial hash value. - */ - private final long initial; - - /** - * The value to increment the hash value by. - */ - private final long increment; - - /** - * Convert bytes to long. - * @param byteArray the byte array to extract the values from. - * @param offset the offset to start extraction from. - * @param len the length of the extraction, may be longer than 8. - * @return - */ - private static long toLong(byte[] byteArray, int offset, int len) { - long val = 0; - len = Math.min(len, Long.BYTES); - for (int i = 0; i < len; i++) { - val <<= 8; - val |= (byteArray[offset + i] & 0x00FF); - } - return val; - } - - /** - * Constructs the SimpleHasher from a byte array. - *

The byte array is split in 2 and each half is interpreted as a long value. - * Excess bytes are ignored. This simplifies the conversion from a Digest or hasher algorithm output - * to the two values used by the SimpleHasher.

- *

If the second long is zero the default increment is used instead.

- * @param buffer the buffer to extract the longs from. - * @throws IllegalArgumentException is buffer length is zero. - * @see #getDefaultIncrement() - */ - public SimpleHasher(byte[] buffer) { - if (buffer.length == 0) { - throw new IllegalArgumentException("buffer length must be greater than 0"); - } - int segment = buffer.length / 2; - this.initial = toLong(buffer, 0, segment); - long possibleIncrement = toLong(buffer, segment, buffer.length - segment); - this.increment = possibleIncrement == 0 ? getDefaultIncrement() : possibleIncrement; - } - - /** - * Constructs the SimpleHasher from 2 longs. The long values will be interpreted as unsigned values. - *

If the increment is zero the default increment is used instead.

- * @param initial The initial value for the hasher. - * @param increment The value to increment the hash by on each iteration. - * @see #getDefaultIncrement() - */ - public SimpleHasher(long initial, long increment) { - this.initial = initial; - this.increment = increment == 0 ? getDefaultIncrement() : increment; - } - - /** - * Get the default increment used when the requested increment is zero. - *

- * By default this is the same - * default increment used in Java's SplittableRandom random number generator. It is the - * fractional representation of the golden ratio (0.618...) with a base of 2^64. - *

- * Implementations may want to override this value to match defaults in legacy implementations. - *

- * @return The default increment to use when the requested increment is zero. - */ - public long getDefaultIncrement() { - return 0x9e3779b97f4a7c15L; - } - - /** - * Performs a modulus calculation on an unsigned long and an integer divisor. - * @param dividend a unsigned long value to calculate the modulus of. - * @param divisor the divisor for the modulus calculation. - * @return the remainder or modulus value. - */ - static int mod(long dividend, int divisor) { - // See Hacker's Delight (2nd ed), section 9.3. - // Assume divisor is positive. - // Divide half the unsigned number and then double the quotient result. - final long quotient = ((dividend >>> 1) / divisor) << 1; - final long remainder = dividend - quotient * divisor; - // remainder in [0, 2 * divisor) - return (int) (remainder >= divisor ? remainder - divisor : remainder); - } - - @Override - public IndexProducer indices(final Shape shape) { - Objects.requireNonNull(shape, "shape"); - - return new IndexProducer() { - - @Override - public boolean forEachIndex(IntPredicate consumer) { - Objects.requireNonNull(consumer, "consumer"); - int bits = shape.getNumberOfBits(); - /* - * Essentially this is computing a wrapped modulus from a start point and an - * increment. So actually you only need two modulus operations before the loop. - * This avoids any modulus operation inside the while loop. It uses a long index - * to avoid overflow. - */ - long index = mod(initial, bits); - int inc = mod(increment, bits); - - for (int functionalCount = 0; functionalCount < shape.getNumberOfHashFunctions(); functionalCount++) { - - if (!consumer.test((int) index)) { - return false; - } - index += inc; - index = index >= bits ? 
index - bits : index; - } - return true; - } - - @Override - public int[] asIndexArray() { - int[] result = new int[shape.getNumberOfHashFunctions()]; - int[] idx = new int[1]; - /* - * This method needs to return duplicate indices - */ - forEachIndex(i -> { - result[idx[0]++] = i; - return true; - }); - return result; - } - }; - } - - @Override - public IndexProducer uniqueIndices(final Shape shape) { - return new IndexProducer() { - - @Override - public boolean forEachIndex(IntPredicate consumer) { - Objects.requireNonNull(consumer, "consumer"); - IntPredicate filter = IndexFilter.create(shape, consumer); - - int bits = shape.getNumberOfBits(); - - // Set up for the modulus. Use a long index to avoid overflow. - long index = mod(initial, bits); - int inc = mod(increment, bits); - - for (int functionalCount = 0; functionalCount < shape.getNumberOfHashFunctions(); functionalCount++) { - - if (!filter.test((int) index)) { - return false; - } - index += inc; - index = index >= bits ? index - bits : index; - } - return true; - } - }; - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java index 74b140c44..d8ceea079 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java @@ -30,15 +30,15 @@ import org.junit.jupiter.api.Test; */ public abstract class AbstractBloomFilterTest { - protected final SimpleHasher from1 = new SimpleHasher(1, 1); + protected final Hasher from1 = new IncrementingHasher(1, 1); protected final long from1Value = 0x3fffeL; - protected final SimpleHasher from11 = new SimpleHasher(11, 1); + protected final Hasher from11 = new IncrementingHasher(11, 1); protected final long from11Value = 0xffff800L; protected final HasherCollection bigHasher = new HasherCollection(from1, from11); 
protected final long bigHashValue = 0xffffffeL; - protected final HasherCollection fullHasher = new HasherCollection(new SimpleHasher(0, 1)/* 0-16 */, - new SimpleHasher(17, 1)/* 17-33 */, new SimpleHasher(33, 1)/* 33-49 */, new SimpleHasher(50, 1)/* 50-66 */, - new SimpleHasher(67, 1)/* 67-83 */ + protected final HasherCollection fullHasher = new HasherCollection(new IncrementingHasher(0, 1)/* 0-16 */, + new IncrementingHasher(17, 1)/* 17-33 */, new IncrementingHasher(33, 1)/* 33-49 */, new IncrementingHasher(50, 1)/* 50-66 */, + new IncrementingHasher(67, 1)/* 67-83 */ ); protected final long[] fullHashValue = { 0xffffffffffffffffL, 0xfffffL }; @@ -150,18 +150,18 @@ public abstract class AbstractBloomFilterTest { assertFalse(bf1.contains(bf2), "BF should not contain BF2"); assertTrue(bf2.contains(bf1), "BF2 should contain BF"); - assertTrue(bf2.contains(new SimpleHasher(1, 1)), "BF2 Should contain this hasher"); - assertFalse(bf2.contains(new SimpleHasher(1, 3)), "BF2 Should not contain this hasher"); + assertTrue(bf2.contains(new IncrementingHasher(1, 1)), "BF2 Should contain this hasher"); + assertFalse(bf2.contains(new IncrementingHasher(1, 3)), "BF2 Should not contain this hasher"); - IndexProducer indexProducer = new SimpleHasher(1, 1).indices(getTestShape()); + IndexProducer indexProducer = new IncrementingHasher(1, 1).indices(getTestShape()); assertTrue(bf2.contains(indexProducer), "BF2 Should contain this hasher"); - indexProducer = new SimpleHasher(1, 3).indices(getTestShape()); + indexProducer = new IncrementingHasher(1, 3).indices(getTestShape()); assertFalse(bf2.contains(indexProducer), "BF2 Should not contain this hasher"); - BitMapProducer bitMapProducer = BitMapProducer.fromIndexProducer(new SimpleHasher(1, 1).indices(getTestShape()), + BitMapProducer bitMapProducer = BitMapProducer.fromIndexProducer(new IncrementingHasher(1, 1).indices(getTestShape()), getTestShape().getNumberOfBits()); assertTrue(bf2.contains(bitMapProducer), "BF2 Should contain 
this hasher"); - bitMapProducer = BitMapProducer.fromIndexProducer(new SimpleHasher(1, 3).indices(getTestShape()), + bitMapProducer = BitMapProducer.fromIndexProducer(new IncrementingHasher(1, 3).indices(getTestShape()), getTestShape().getNumberOfBits()); assertFalse(bf2.contains(bitMapProducer), "BF2 Should not contain this hasher"); @@ -228,11 +228,11 @@ public abstract class AbstractBloomFilterTest { // the data provided above do not generate an estimate that is equivalent to the // actual. - filter1.merge(new SimpleHasher(4, 1)); + filter1.merge(new IncrementingHasher(4, 1)); assertEquals(1, filter1.estimateN()); - filter1.merge(new SimpleHasher(17, 1)); + filter1.merge(new IncrementingHasher(17, 1)); assertEquals(3, filter1.estimateN()); } @@ -244,7 +244,7 @@ public abstract class AbstractBloomFilterTest { public final void testAsBitMapArray() { // test when multiple long values are returned. - final SimpleHasher hasher = new SimpleHasher(63, 1); + final IncrementingHasher hasher = new IncrementingHasher(63, 1); final BloomFilter bf = createFilter(Shape.fromKM(2, 72), hasher); final long[] lb = bf.asBitMapArray(); assertEquals(2, lb.length); @@ -265,7 +265,7 @@ public abstract class AbstractBloomFilterTest { filter = createFilter(getTestShape(), fullHasher); assertTrue(filter.isFull(), "Should be full"); - filter = createFilter(getTestShape(), new SimpleHasher(1, 3)); + filter = createFilter(getTestShape(), new IncrementingHasher(1, 3)); assertFalse(filter.isFull(), "Should not be full"); } @@ -313,12 +313,12 @@ public abstract class AbstractBloomFilterTest { // test error when bloom filter returns values out of range final BloomFilter bf5 = new SimpleBloomFilter( Shape.fromKM(getTestShape().getNumberOfHashFunctions(), 3 * Long.SIZE), - new SimpleHasher(Long.SIZE * 2, 1)); + new IncrementingHasher(Long.SIZE * 2, 1)); assertThrows(IllegalArgumentException.class, () -> bf1.merge(bf5)); final BloomFilter bf6 = new SparseBloomFilter( 
Shape.fromKM(getTestShape().getNumberOfHashFunctions(), 3 * Long.SIZE), - new SimpleHasher(Long.SIZE * 2, 1)); + new IncrementingHasher(Long.SIZE * 2, 1)); assertThrows(IllegalArgumentException.class, () -> bf1.merge(bf6)); } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java index 02839df9b..b7ca7dd37 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java @@ -234,7 +234,7 @@ public abstract class AbstractCountingBloomFilterTest String.format("Did not produce k=%d * m=%d indices", k, getHasherSize(hasher))); + + // test early exit + count[0] = 0; + hasher.indices(Shape.fromKM(k, m)).forEachIndex(i -> { + assertTrue(i >= 0 && i < m, () -> "Out of range: " + i + ", m=" + m); + count[0]++; + return false; + }); + assertEquals(1, count[0], "did not exit early" ); } @Test public void testUniqueIndex() { - // create a hasher that produces duplicates with the specified shape. - // this setup produces 5, 17, 29, 41, 53, 65 two times - Shape shape = Shape.fromKM(12, 72); - Hasher hasher = new SimpleHasher(5, 12); - Set set = new HashSet<>(); - assertTrue(hasher.uniqueIndices(shape).forEachIndex(set::add), "Duplicate detected"); - assertEquals(6, set.size()); + // generating 11 numbers in the range of [0,9] will yield at least on collision. 
+ Shape shape = Shape.fromKM(11, 10); + Hasher hasher = createHasher(); + IndexProducer producer = hasher.indices(shape); + List full = Arrays.stream(producer.asIndexArray()).boxed().collect(Collectors.toList()); + producer = hasher.uniqueIndices(shape); + List unique = Arrays.stream(producer.asIndexArray()).boxed().collect(Collectors.toList()); + assertTrue( full.size() > unique.size() ); + Set set = new HashSet( unique ); + assertEquals( set.size(), unique.size() ); } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromArrayCountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromArrayCountingBloomFilterTest.java index 3329bc4b5..7961b6d53 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromArrayCountingBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BitCountProducerFromArrayCountingBloomFilterTest.java @@ -23,7 +23,7 @@ public class BitCountProducerFromArrayCountingBloomFilterTest extends AbstractBi @Override protected BitCountProducer createProducer() { ArrayCountingBloomFilter filter = new ArrayCountingBloomFilter(shape); - Hasher hasher = new SimpleHasher(0, 1); + Hasher hasher = new IncrementingHasher(0, 1); filter.merge(hasher); return filter; } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerFromArrayCountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerFromArrayCountingBloomFilterTest.java index 48f5fb411..38c24af73 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerFromArrayCountingBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerFromArrayCountingBloomFilterTest.java @@ -23,7 +23,7 @@ public class BitMapProducerFromArrayCountingBloomFilterTest extends AbstractBitM @Override protected BitMapProducer createProducer() { 
ArrayCountingBloomFilter filter = new ArrayCountingBloomFilter(shape); - Hasher hasher = new SimpleHasher(0, 1); + Hasher hasher = new IncrementingHasher(0, 1); filter.merge(hasher); return filter; } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerFromSimpleBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerFromSimpleBloomFilterTest.java index f73b4807b..aa000797e 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerFromSimpleBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerFromSimpleBloomFilterTest.java @@ -22,7 +22,7 @@ public class BitMapProducerFromSimpleBloomFilterTest extends AbstractBitMapProdu @Override protected BitMapProducer createProducer() { - Hasher hasher = new SimpleHasher(0, 1); + Hasher hasher = new IncrementingHasher(0, 1); return new SimpleBloomFilter(shape, hasher); } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerFromSparseBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerFromSparseBloomFilterTest.java index 0a6331ce7..0c80b6d0d 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerFromSparseBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/BitMapProducerFromSparseBloomFilterTest.java @@ -22,7 +22,7 @@ public class BitMapProducerFromSparseBloomFilterTest extends AbstractBitMapProdu @Override protected BitMapProducer createProducer() { - Hasher hasher = new SimpleHasher(0, 1); + Hasher hasher = new IncrementingHasher(0, 1); return new SparseBloomFilter(shape, hasher); } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterTest.java index 2c107fb01..26862bb19 100644 --- 
a/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/DefaultBloomFilterTest.java @@ -52,7 +52,7 @@ public class DefaultBloomFilterTest extends AbstractBloomFilterTest new EnhancedDoubleHasher(new byte[0])); + } + + @Test + void testModEdgeCases() { + for (long dividend : new long[] { -1, -2, -3, -6378683, -23567468136887892L, Long.MIN_VALUE, 345, 678686, + 67868768686878924L, Long.MAX_VALUE }) { + for (int divisor : new int[] { 1, 2, 3, 5, 13, Integer.MAX_VALUE }) { + assertEquals((int) Long.remainderUnsigned(dividend, divisor), EnhancedDoubleHasher.mod(dividend, divisor), + () -> String.format("failure with dividend=%s and divisor=%s.", dividend, divisor)); + } + } + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/HasherCollectionTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/HasherCollectionTest.java index 29d3c5520..997ee01a2 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/HasherCollectionTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/HasherCollectionTest.java @@ -33,7 +33,7 @@ public class HasherCollectionTest extends AbstractHasherTest { @Override protected HasherCollection createHasher() { - return new HasherCollection(new SimpleHasher(1, 1), new SimpleHasher(2, 2)); + return new HasherCollection(new IncrementingHasher(1, 1), new IncrementingHasher(2, 2)); } @Override @@ -54,7 +54,7 @@ public class HasherCollectionTest extends AbstractHasherTest { @Test public void testCollectionConstructor() { - List lst = Arrays.asList(new SimpleHasher(3, 2), new SimpleHasher(4, 2)); + List lst = Arrays.asList(new IncrementingHasher(3, 2), new IncrementingHasher(4, 2)); HasherCollectionTest nestedTest = new HasherCollectionTest() { @Override protected HasherCollection createHasher() { @@ -71,7 +71,7 @@ public class HasherCollectionTest extends AbstractHasherTest { nestedTest = 
new HasherCollectionTest() { @Override protected HasherCollection createHasher() { - return new HasherCollection(new SimpleHasher(3, 2), new SimpleHasher(4, 2)); + return new HasherCollection(new IncrementingHasher(3, 2), new IncrementingHasher(4, 2)); } @Override @@ -85,10 +85,10 @@ public class HasherCollectionTest extends AbstractHasherTest { @Test public void testAdd() { HasherCollection hasher = createHasher(); - hasher.add(new SimpleHasher(2, 2)); + hasher.add(new IncrementingHasher(2, 2)); assertEquals(3, hasher.getHashers().size()); - hasher.add(Arrays.asList(new SimpleHasher(3, 2), new SimpleHasher(4, 2))); + hasher.add(Arrays.asList(new IncrementingHasher(3, 2), new IncrementingHasher(4, 2))); assertEquals(5, hasher.getHashers().size()); } @@ -97,7 +97,7 @@ public class HasherCollectionTest extends AbstractHasherTest { // create a hasher that produces duplicates with the specified shape. // this setup produces 5, 17, 29, 41, 53, 65 two times Shape shape = Shape.fromKM(12, 72); - Hasher h1 = new SimpleHasher(5, 12); + Hasher h1 = new IncrementingHasher(5, 12); HasherCollection hasher = createEmptyHasher(); hasher.add(h1); hasher.add(h1); @@ -115,9 +115,9 @@ public class HasherCollectionTest extends AbstractHasherTest { @Test void testHasherCollection() { - Hasher h1 = new SimpleHasher(13, 4678); - Hasher h2 = new SimpleHasher(42, 987); - Hasher h3 = new SimpleHasher(454, 2342); + Hasher h1 = new IncrementingHasher(13, 4678); + Hasher h2 = new IncrementingHasher(42, 987); + Hasher h3 = new IncrementingHasher(454, 2342); HasherCollection hc1 = new HasherCollection(Arrays.asList(h1, h1)); HasherCollection hc2 = new HasherCollection(Arrays.asList(h2, h3)); diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/IncrementingHasher.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IncrementingHasher.java new file mode 100644 index 000000000..bc4003f18 --- /dev/null +++ 
b/src/test/java/org/apache/commons/collections4/bloomfilter/IncrementingHasher.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.Objects; +import java.util.function.IntPredicate; + +/** + * A Hasher that implements simple combinatorial hashing as described by + * Kirsch and Mitzenmacher. + * + *

To be used for testing only.

+ * + * @since 4.5 + */ +class IncrementingHasher implements Hasher { + + /** + * The initial hash value. + */ + private final long initial; + + /** + * The value to increment the hash value by. + */ + private final long increment; + + /** + * Constructs the IncrementingHasher from 2 longs. The long values will be interpreted as unsigned values. + *

+ * The initial hash value will be the modulus of the initial value. + * Subsequent values will be calculated by repeatedly adding the increment to the last value and taking the modulus. + *

+ * @param initial The initial value for the hasher. + * @param increment The value to increment the hash by on each iteration. + */ + IncrementingHasher(long initial, long increment) { + this.initial = initial; + this.increment = increment; + } + + @Override + public IndexProducer indices(final Shape shape) { + Objects.requireNonNull(shape, "shape"); + + return new IndexProducer() { + + @Override + public boolean forEachIndex(IntPredicate consumer) { + Objects.requireNonNull(consumer, "consumer"); + int bits = shape.getNumberOfBits(); + + // Essentially this is computing a wrapped modulus from a start point and an + // increment. So actually you only need two modulus operations before the loop. + // This avoids any modulus operation inside the while loop. It uses a long index + // to avoid overflow. + + long index = EnhancedDoubleHasher.mod(initial, bits); + int inc = EnhancedDoubleHasher.mod(increment, bits); + + for (int functionalCount = 0; functionalCount < shape.getNumberOfHashFunctions(); functionalCount++) { + if (!consumer.test((int) index)) { + return false; + } + index += inc; + index = index >= bits ? 
index - bits : index; + } + return true; + } + + @Override + public int[] asIndexArray() { + int[] result = new int[shape.getNumberOfHashFunctions()]; + int[] idx = new int[1]; + + // This method needs to return duplicate indices + + forEachIndex(i -> { + result[idx[0]++] = i; + return true; + }); + return result; + } + }; + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromArrayCountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromArrayCountingBloomFilterTest.java index 383cf3861..0ea2c6079 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromArrayCountingBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromArrayCountingBloomFilterTest.java @@ -23,7 +23,7 @@ public class IndexProducerFromArrayCountingBloomFilterTest extends AbstractIndex @Override protected IndexProducer createProducer() { ArrayCountingBloomFilter filter = new ArrayCountingBloomFilter(shape); - Hasher hasher = new SimpleHasher(0, 1); + Hasher hasher = new IncrementingHasher(0, 1); filter.merge(hasher); return filter; } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromHasherCollectionTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromHasherCollectionTest.java index d7e61d796..1376a4bfc 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromHasherCollectionTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromHasherCollectionTest.java @@ -20,7 +20,7 @@ public class IndexProducerFromHasherCollectionTest extends AbstractIndexProducer @Override protected IndexProducer createProducer() { - return new HasherCollection(new SimpleHasher(0, 1), new SimpleHasher(0, 2)).indices(Shape.fromKM(17, 72)); + return new HasherCollection(new IncrementingHasher(0, 1), new IncrementingHasher(0, 
2)).indices(Shape.fromKM(17, 72)); } @Override diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromHasherTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromHasherTest.java index c089b4b42..683b3705e 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromHasherTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromHasherTest.java @@ -20,7 +20,7 @@ public class IndexProducerFromHasherTest extends AbstractIndexProducerTest { @Override protected IndexProducer createProducer() { - return new SimpleHasher(0, 1).indices(Shape.fromKM(17, 72)); + return new IncrementingHasher(0, 1).indices(Shape.fromKM(17, 72)); } @Override diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSimpleBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSimpleBloomFilterTest.java index 852542867..d62ac959e 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSimpleBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSimpleBloomFilterTest.java @@ -22,7 +22,7 @@ public class IndexProducerFromSimpleBloomFilterTest extends AbstractIndexProduce @Override protected IndexProducer createProducer() { - Hasher hasher = new SimpleHasher(0, 1); + Hasher hasher = new IncrementingHasher(0, 1); return new SparseBloomFilter(shape, hasher); } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSparseBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSparseBloomFilterTest.java index 4204c90fe..b51e01b40 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSparseBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSparseBloomFilterTest.java @@ -22,7 
+22,7 @@ public class IndexProducerFromSparseBloomFilterTest extends AbstractIndexProduce @Override protected IndexProducer createProducer() { - Hasher hasher = new SimpleHasher(0, 1); + Hasher hasher = new IncrementingHasher(0, 1); return new SimpleBloomFilter(shape, hasher); } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java index 9d7659d1f..f958cdcc9 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java @@ -25,9 +25,9 @@ import org.junit.jupiter.api.Test; */ public class SetOperationsTest { - protected final SimpleHasher from1 = new SimpleHasher(1, 1); + protected final Hasher from1 = new IncrementingHasher(1, 1); protected final long from1Value = 0x3FFFEL; - protected final SimpleHasher from11 = new SimpleHasher(11, 1); + protected final Hasher from11 = new IncrementingHasher(11, 1); protected final long from11Value = 0xFFFF800L; protected final HasherCollection bigHasher = new HasherCollection(from1, from11); protected final long bigHashValue = 0xFFFFFFEL; @@ -49,7 +49,7 @@ public class SetOperationsTest { Shape shape2 = Shape.fromKM(2, 72); filter1 = new SimpleBloomFilter(shape2, from1); - filter2 = new SimpleBloomFilter(shape2, new SimpleHasher(2, 1)); + filter2 = new SimpleBloomFilter(shape2, new IncrementingHasher(2, 1)); int dotProduct = /* [1,2] & [2,3] = [2] = */ 1; int cardinalityA = 2; diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleHasherTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleHasherTest.java deleted file mode 100644 index cb52bf80a..000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleHasherTest.java +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * 
contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertThrows; -import java.util.ArrayList; -import java.util.List; -import org.junit.jupiter.api.Test; - -/** - * Tests the {@link SimpleHasher}. 
- */ -public class SimpleHasherTest extends AbstractHasherTest { - - @Override - protected Hasher createHasher() { - return new SimpleHasher(1, 1); - } - - @Override - protected Hasher createEmptyHasher() { - return NullHasher.INSTANCE; - } - - @Override - protected int getHasherSize(Hasher hasher) { - return 1; - } - - private void assertConstructorBuffer(Shape shape, byte[] buffer, Integer[] expected) { - SimpleHasher hasher = new SimpleHasher(buffer); - List lst = new ArrayList<>(); - IndexProducer producer = hasher.indices(shape); - producer.forEachIndex(lst::add); - assertEquals(expected.length, lst.size()); - for (int i = 0; i < expected.length; i++) { - assertEquals(expected[i], lst.get(i)); - } - } - - private void assertIncrement(SimpleHasher hasher, long defaultIncrement) { - assertEquals(defaultIncrement, hasher.getDefaultIncrement()); - int[] values = hasher.indices(Shape.fromKM(2, Integer.MAX_VALUE)).asIndexArray(); - assertEquals(0, values[0]); - assertEquals(Long.remainderUnsigned(defaultIncrement, Integer.MAX_VALUE), values[1]); - } - - @Test - public void testConstructor() { - Shape shape = Shape.fromKM(5, 10); - assertConstructorBuffer(shape, new byte[] { 1, 1 }, new Integer[] { 1, 2, 3, 4, 5 }); - assertConstructorBuffer(shape, new byte[] { 1 }, new Integer[] { 0, 1, 2, 3, 4 }); - assertConstructorBuffer(shape, new byte[] { 1, 0, 1 }, new Integer[] { 1, 2, 3, 4, 5 }); - assertConstructorBuffer(shape, new byte[] { 0, 1, 0, 1 }, new Integer[] { 1, 2, 3, 4, 5 }); - assertConstructorBuffer(shape, new byte[] { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1 }, - new Integer[] { 1, 2, 3, 4, 5 }); - assertConstructorBuffer(shape, new byte[] { 0, 0, 0, 0, 0, 0, 0, 1, 5, 5, 0, 0, 0, 0, 0, 0, 0, 1, 5, 5 }, - new Integer[] { 1, 2, 3, 4, 5 }); - assertConstructorBuffer(shape, new byte[] { 0, 0, 0, 0, 0, 0, 0, 1, 5, 0, 0, 0, 0, 0, 0, 0, 1, 5, 5 }, - new Integer[] { 1, 2, 3, 4, 5 }); - - // test empty buffer - assertThrows(IllegalArgumentException.class, () -> 
new SimpleHasher(new byte[0])); - - // test zero incrementer gets default - // default increment from SimpleHasher. - long defaultIncrement = 0x9e3779b97f4a7c15L; - SimpleHasher hasher = new SimpleHasher(0, 0); - assertIncrement(new SimpleHasher(0, 0), defaultIncrement); - assertIncrement(new SimpleHasher(new byte[2]), defaultIncrement); - - // test that changing default increment works - defaultIncrement = 4; - defaultIncrement = 4L; - hasher = new SimpleHasher(0, 0) { - @Override - public long getDefaultIncrement() { - return 4L; - } - }; - assertIncrement(hasher, defaultIncrement); - hasher = new SimpleHasher(new byte[2]) { - @Override - public long getDefaultIncrement() { - return 4L; - } - }; - - assertEquals(defaultIncrement, hasher.getDefaultIncrement()); - } - - @Test - void testModEdgeCases() { - for (long dividend : new long[] { -1, -2, -3, -6378683, -23567468136887892L, Long.MIN_VALUE, 345, 678686, - 67868768686878924L, Long.MAX_VALUE }) { - for (int divisor : new int[] { 1, 2, 3, 5, 13, Integer.MAX_VALUE }) { - assertEquals((int) Long.remainderUnsigned(dividend, divisor), SimpleHasher.mod(dividend, divisor), - () -> String.format("failure with dividend=%s and divisor=%s.", dividend, divisor)); - } - } - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/UniqueIndexProducerFromHasherCollectionTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/UniqueIndexProducerFromHasherCollectionTest.java index 4aaf9141a..99ef6f200 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/UniqueIndexProducerFromHasherCollectionTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/UniqueIndexProducerFromHasherCollectionTest.java @@ -20,7 +20,7 @@ public class UniqueIndexProducerFromHasherCollectionTest extends AbstractIndexPr @Override protected IndexProducer createProducer() { - return new HasherCollection(new SimpleHasher(0, 1), new SimpleHasher(0, 2)).uniqueIndices(Shape.fromKM(17, 72)); + 
return new HasherCollection(new IncrementingHasher(0, 1), new IncrementingHasher(0, 2)).uniqueIndices(Shape.fromKM(17, 72)); } @Override diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/UniqueIndexProducerFromHasherTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/UniqueIndexProducerFromHasherTest.java index f711a5720..84c17b60f 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/UniqueIndexProducerFromHasherTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/UniqueIndexProducerFromHasherTest.java @@ -20,7 +20,7 @@ public class UniqueIndexProducerFromHasherTest extends AbstractIndexProducerTest @Override protected IndexProducer createProducer() { - return new SimpleHasher(0, 1).uniqueIndices(Shape.fromKM(17, 72)); + return new IncrementingHasher(0, 1).uniqueIndices(Shape.fromKM(17, 72)); } @Override