diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/EnhancedDoubleHasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/EnhancedDoubleHasher.java new file mode 100644 index 000000000..347a951a3 --- /dev/null +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/EnhancedDoubleHasher.java @@ -0,0 +1,227 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.Objects; +import java.util.function.IntPredicate; + +/** + * A Hasher that implements combinatorial hashing as described by + * Kirsch and Mitzenmacher using the enhanced double hashing technique + * described in the Wikipedia article Double Hashing. + *
+ * Common use for this hasher is to generate bit indices from a byte array output of a hashing + * or MessageDigest algorithm.
+ * + * Note that it is worse to create smaller numbers for the {@code initial} and {@code increment}. If the {@code initial} is smaller than + * the number of bits in a filter then hashing will start at the same point when the size increases; likewise the {@code increment} will be + * the same if it remains smaller than the number of bits in the filter and so the first few indices will be the same if the number of bits + * changes (but is still larger than the {@code increment}). In a worst-case scenario with small {@code initial} and {@code increment} for + * all items, hashing may not create indices that fill the full region within a much larger filter. Imagine hashers created with {@code initial} + * and {@code increment} values less than 255 with a filter size of 30000 and number of hash functions as 5. Ignoring the + * tetrahedral addition (a maximum of 20 for k=5) the max index is 255 * 4 + 255 = 1275; this covers 4.25% of the filter. This also + * ignores the negative wrapping but the behaviour is the same: some bits cannot be reached. + *
+ * So this needs to be avoided as the filter probability assumptions will be void. If the {@code initial} and {@code increment} are larger + * than the number of bits then the modulus will create a 'random' position and increment within the size. + *
+ * + * @since 4.5 + */ +public class EnhancedDoubleHasher implements Hasher { + + /** + * The initial hash value. + */ + private final long initial; + + /** + * The value to increment the hash value by. + */ + private final long increment; + + /** + * Convert bytes to big-endian long filling with zero bytes as necessary.. + * @param byteArray the byte array to extract the values from. + * @param offset the offset to start extraction from. + * @param len the length of the extraction, may be longer than 8. + * @return + */ + private static long toLong(byte[] byteArray, int offset, int len) { + long val = 0; + int shift = Long.SIZE; + final int end = offset + Math.min(len, Long.BYTES); + for (int i = offset; i < end; i++) { + shift -= Byte.SIZE; + val |= ((long) (byteArray[i] & 0xFF) << shift); + } + return val; + } + + /** + * Constructs the EnhancedDoubleHasher from a byte array. + *+ * This method simplifies the conversion from a Digest or hasher algorithm output + * to the two values used by the EnhancedDoubleHasher.
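+ * The byte array is split in 2 and the first 8 bytes of each half are interpreted as a big-endian long value. + * Excess bytes are ignored. + * If there are fewer than 16 bytes the following conversions are made. + * <ol> + * <li>If there is an odd number of bytes, the excess byte is assigned to the increment value.</li> + * <li>The bytes allotted to each value are read in big-endian order; any byte not populated is set to zero.</li> + * </ol>
+ * + * This ensures that small arrays generate the largest possible increment and initial values. + *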
+ * @param buffer the buffer to extract the longs from. + * @throws IllegalArgumentException is buffer length is zero. + */ + public EnhancedDoubleHasher(byte[] buffer) { + if (buffer.length == 0) { + throw new IllegalArgumentException("buffer length must be greater than 0"); + } + // divide by 2 + int segment = buffer.length / 2; + this.initial = toLong(buffer, 0, segment); + this.increment = toLong(buffer, segment, buffer.length - segment); + } + + /** + * Constructs the EnhancedDoubleHasher from 2 longs. The long values will be interpreted as unsigned values. + * @param initial The initial value for the hasher. + * @param increment The value to increment the hash by on each iteration. + */ + public EnhancedDoubleHasher(long initial, long increment) { + this.initial = initial; + this.increment = increment; + } + + /** + * Gets the initial value for the hash calculation. + * @return the initial value for the hash calculation. + */ + long getInitial() { + return initial; + } + + /** + * Gets the increment value for the hash calculation. + * @return the increment value for the hash calculation. + */ + long getIncrement() { + return increment; + } + + /** + * Performs a modulus calculation on an unsigned long and an integer divisor. + * @param dividend a unsigned long value to calculate the modulus of. + * @param divisor the divisor for the modulus calculation. + * @return the remainder or modulus value. + */ + static int mod(long dividend, int divisor) { + // See Hacker's Delight (2nd ed), section 9.3. + // Assume divisor is positive. + // Divide half the unsigned number and then double the quotient result. + final long quotient = ((dividend >>> 1) / divisor) << 1; + final long remainder = dividend - quotient * divisor; + // remainder in [0, 2 * divisor) + return (int) (remainder >= divisor ? 
remainder - divisor : remainder); + } + + @Override + public IndexProducer indices(final Shape shape) { + Objects.requireNonNull(shape, "shape"); + + return new IndexProducer() { + + @Override + public boolean forEachIndex(IntPredicate consumer) { + Objects.requireNonNull(consumer, "consumer"); + final int bits = shape.getNumberOfBits(); + // Enhanced double hashing: + // hash[i] = ( h1(x) + i*h2(x) + (i*i*i - i)/6 ) mod bits + // See: https://en.wikipedia.org/wiki/Double_hashing#Enhanced_double_hashing + // + // Essentially this is computing a wrapped modulus from a start point and an + // increment and an additional term as a tetrahedral number. + // You only need two modulus operations before the loop. Within the loop + // the modulus is handled using the sign bit to detect wrapping to ensure: + // 0 <= index < bits + // 0 <= inc < bits + // The final hash is: + // hash[i] = ( h1(x) - i*h2(x) - (i*i*i - i)/6 ) wrapped in [0, bits) + + int index = mod(initial, bits); + int inc = mod(increment, bits); + + final int k = shape.getNumberOfHashFunctions(); + if (k > bits) { + for (int j = k; j > 0;) { + // handle k > bits + final int block = Math.min(j, bits); + j -= block; + for (int i = 0; i < block; i++) { + if (!consumer.test(index)) { + return false; + } + // Update index and handle wrapping + index -= inc; + index = index < 0 ? index + bits : index; + + // Incorporate the counter into the increment to create a + // tetrahedral number additional term, and handle wrapping. + inc -= i; + inc = inc < 0 ? inc + bits : inc; + } + } + } else { + for (int i = 0; i < k; i++) { + if (!consumer.test(index)) { + return false; + } + // Update index and handle wrapping + index -= inc; + index = index < 0 ? index + bits : index; + + // Incorporate the counter into the increment to create a + // tetrahedral number additional term, and handle wrapping. + inc -= i; + inc = inc < 0 ? 
inc + bits : inc; + } + } + return true; + } + + @Override + public int[] asIndexArray() { + int[] result = new int[shape.getNumberOfHashFunctions()]; + int[] idx = new int[1]; + + // This method needs to return duplicate indices + + forEachIndex(i -> { + result[idx[0]++] = i; + return true; + }); + return result; + } + }; + } +} diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/Hasher.java b/src/main/java/org/apache/commons/collections4/bloomfilter/Hasher.java index 6f2b5aab8..82445a623 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/Hasher.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/Hasher.java @@ -16,6 +16,8 @@ */ package org.apache.commons.collections4.bloomfilter; +import java.util.Objects; + /** * A Hasher creates IndexProducer based on the hash implementation and the * provided Shape. @@ -52,5 +54,10 @@ public interface Hasher { * @param shape the shape of the desired Bloom filter. * @return the iterator of integers */ - IndexProducer uniqueIndices(Shape shape); + default IndexProducer uniqueIndices(Shape shape) { + return consumer -> { + Objects.requireNonNull(consumer, "consumer"); + return Hasher.this.indices(shape).forEachIndex(IndexFilter.create(shape, consumer)); + }; + } } diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java b/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java index 40db56516..d39df255e 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java @@ -186,20 +186,6 @@ public final class Shape implements Comparable- * Common use for this hasher is to generate a byte array as the output of a hashing - * or MessageDigest algorithm.
- * - * @since 4.5 - */ -public class SimpleHasher implements Hasher { - - /** - * The initial hash value. - */ - private final long initial; - - /** - * The value to increment the hash value by. - */ - private final long increment; - - /** - * Convert bytes to long. - * @param byteArray the byte array to extract the values from. - * @param offset the offset to start extraction from. - * @param len the length of the extraction, may be longer than 8. - * @return - */ - private static long toLong(byte[] byteArray, int offset, int len) { - long val = 0; - len = Math.min(len, Long.BYTES); - for (int i = 0; i < len; i++) { - val <<= 8; - val |= (byteArray[offset + i] & 0x00FF); - } - return val; - } - - /** - * Constructs the SimpleHasher from a byte array. - *The byte array is split in 2 and each half is interpreted as a long value. - * Excess bytes are ignored. This simplifies the conversion from a Digest or hasher algorithm output - * to the two values used by the SimpleHasher.
- *If the second long is zero the default increment is used instead.
- * @param buffer the buffer to extract the longs from. - * @throws IllegalArgumentException is buffer length is zero. - * @see #getDefaultIncrement() - */ - public SimpleHasher(byte[] buffer) { - if (buffer.length == 0) { - throw new IllegalArgumentException("buffer length must be greater than 0"); - } - int segment = buffer.length / 2; - this.initial = toLong(buffer, 0, segment); - long possibleIncrement = toLong(buffer, segment, buffer.length - segment); - this.increment = possibleIncrement == 0 ? getDefaultIncrement() : possibleIncrement; - } - - /** - * Constructs the SimpleHasher from 2 longs. The long values will be interpreted as unsigned values. - *If the increment is zero the default increment is used instead.
- * @param initial The initial value for the hasher. - * @param increment The value to increment the hash by on each iteration. - * @see #getDefaultIncrement() - */ - public SimpleHasher(long initial, long increment) { - this.initial = initial; - this.increment = increment == 0 ? getDefaultIncrement() : increment; - } - - /** - * Get the default increment used when the requested increment is zero. - *- * By default this is the same - * default increment used in Java's SplittableRandom random number generator. It is the - * fractional representation of the golden ratio (0.618...) with a base of 2^64. - *
- * Implementations may want to override this value to match defaults in legacy implementations. - *
- * @return The default increment to use when the requested increment is zero. - */ - public long getDefaultIncrement() { - return 0x9e3779b97f4a7c15L; - } - - /** - * Performs a modulus calculation on an unsigned long and an integer divisor. - * @param dividend a unsigned long value to calculate the modulus of. - * @param divisor the divisor for the modulus calculation. - * @return the remainder or modulus value. - */ - static int mod(long dividend, int divisor) { - // See Hacker's Delight (2nd ed), section 9.3. - // Assume divisor is positive. - // Divide half the unsigned number and then double the quotient result. - final long quotient = ((dividend >>> 1) / divisor) << 1; - final long remainder = dividend - quotient * divisor; - // remainder in [0, 2 * divisor) - return (int) (remainder >= divisor ? remainder - divisor : remainder); - } - - @Override - public IndexProducer indices(final Shape shape) { - Objects.requireNonNull(shape, "shape"); - - return new IndexProducer() { - - @Override - public boolean forEachIndex(IntPredicate consumer) { - Objects.requireNonNull(consumer, "consumer"); - int bits = shape.getNumberOfBits(); - /* - * Essentially this is computing a wrapped modulus from a start point and an - * increment. So actually you only need two modulus operations before the loop. - * This avoids any modulus operation inside the while loop. It uses a long index - * to avoid overflow. - */ - long index = mod(initial, bits); - int inc = mod(increment, bits); - - for (int functionalCount = 0; functionalCount < shape.getNumberOfHashFunctions(); functionalCount++) { - - if (!consumer.test((int) index)) { - return false; - } - index += inc; - index = index >= bits ? 
index - bits : index; - } - return true; - } - - @Override - public int[] asIndexArray() { - int[] result = new int[shape.getNumberOfHashFunctions()]; - int[] idx = new int[1]; - /* - * This method needs to return duplicate indices - */ - forEachIndex(i -> { - result[idx[0]++] = i; - return true; - }); - return result; - } - }; - } - - @Override - public IndexProducer uniqueIndices(final Shape shape) { - return new IndexProducer() { - - @Override - public boolean forEachIndex(IntPredicate consumer) { - Objects.requireNonNull(consumer, "consumer"); - IntPredicate filter = IndexFilter.create(shape, consumer); - - int bits = shape.getNumberOfBits(); - - // Set up for the modulus. Use a long index to avoid overflow. - long index = mod(initial, bits); - int inc = mod(increment, bits); - - for (int functionalCount = 0; functionalCount < shape.getNumberOfHashFunctions(); functionalCount++) { - - if (!filter.test((int) index)) { - return false; - } - index += inc; - index = index >= bits ? index - bits : index; - } - return true; - } - }; - } -} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java index 74b140c44..d8ceea079 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java @@ -30,15 +30,15 @@ import org.junit.jupiter.api.Test; */ public abstract class AbstractBloomFilterTestTo be used for testing only.
+ * + * @since 4.5 + */ +class IncrementingHasher implements Hasher { + + /** + * The initial hash value. + */ + private final long initial; + + /** + * The value to increment the hash value by. + */ + private final long increment; + + /** + * Constructs the IncrementingHasher from 2 longs. The long values will be interpreted as unsigned values. + *+ * The initial hash value will be the modulus of the initial value. + * Subsequent values will be calculated by repeatedly adding the increment to the last value and taking the modulus. + *
+ * @param initial The initial value for the hasher. + * @param increment The value to increment the hash by on each iteration. + */ + IncrementingHasher(long initial, long increment) { + this.initial = initial; + this.increment = increment; + } + + @Override + public IndexProducer indices(final Shape shape) { + Objects.requireNonNull(shape, "shape"); + + return new IndexProducer() { + + @Override + public boolean forEachIndex(IntPredicate consumer) { + Objects.requireNonNull(consumer, "consumer"); + int bits = shape.getNumberOfBits(); + + // Essentially this is computing a wrapped modulus from a start point and an + // increment. So actually you only need two modulus operations before the loop. + // This avoids any modulus operation inside the while loop. It uses a long index + // to avoid overflow. + + long index = EnhancedDoubleHasher.mod(initial, bits); + int inc = EnhancedDoubleHasher.mod(increment, bits); + + for (int functionalCount = 0; functionalCount < shape.getNumberOfHashFunctions(); functionalCount++) { + if (!consumer.test((int) index)) { + return false; + } + index += inc; + index = index >= bits ? 
index - bits : index; + } + return true; + } + + @Override + public int[] asIndexArray() { + int[] result = new int[shape.getNumberOfHashFunctions()]; + int[] idx = new int[1]; + + // This method needs to return duplicate indices + + forEachIndex(i -> { + result[idx[0]++] = i; + return true; + }); + return result; + } + }; + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromArrayCountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromArrayCountingBloomFilterTest.java index 383cf3861..0ea2c6079 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromArrayCountingBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromArrayCountingBloomFilterTest.java @@ -23,7 +23,7 @@ public class IndexProducerFromArrayCountingBloomFilterTest extends AbstractIndex @Override protected IndexProducer createProducer() { ArrayCountingBloomFilter filter = new ArrayCountingBloomFilter(shape); - Hasher hasher = new SimpleHasher(0, 1); + Hasher hasher = new IncrementingHasher(0, 1); filter.merge(hasher); return filter; } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromHasherCollectionTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromHasherCollectionTest.java index d7e61d796..1376a4bfc 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromHasherCollectionTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromHasherCollectionTest.java @@ -20,7 +20,7 @@ public class IndexProducerFromHasherCollectionTest extends AbstractIndexProducer @Override protected IndexProducer createProducer() { - return new HasherCollection(new SimpleHasher(0, 1), new SimpleHasher(0, 2)).indices(Shape.fromKM(17, 72)); + return new HasherCollection(new IncrementingHasher(0, 1), new IncrementingHasher(0, 
2)).indices(Shape.fromKM(17, 72)); } @Override diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromHasherTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromHasherTest.java index c089b4b42..683b3705e 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromHasherTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromHasherTest.java @@ -20,7 +20,7 @@ public class IndexProducerFromHasherTest extends AbstractIndexProducerTest { @Override protected IndexProducer createProducer() { - return new SimpleHasher(0, 1).indices(Shape.fromKM(17, 72)); + return new IncrementingHasher(0, 1).indices(Shape.fromKM(17, 72)); } @Override diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSimpleBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSimpleBloomFilterTest.java index 852542867..d62ac959e 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSimpleBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSimpleBloomFilterTest.java @@ -22,7 +22,7 @@ public class IndexProducerFromSimpleBloomFilterTest extends AbstractIndexProduce @Override protected IndexProducer createProducer() { - Hasher hasher = new SimpleHasher(0, 1); + Hasher hasher = new IncrementingHasher(0, 1); return new SparseBloomFilter(shape, hasher); } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSparseBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSparseBloomFilterTest.java index 4204c90fe..b51e01b40 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSparseBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSparseBloomFilterTest.java @@ -22,7 
+22,7 @@ public class IndexProducerFromSparseBloomFilterTest extends AbstractIndexProduce @Override protected IndexProducer createProducer() { - Hasher hasher = new SimpleHasher(0, 1); + Hasher hasher = new IncrementingHasher(0, 1); return new SimpleBloomFilter(shape, hasher); } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java index 9d7659d1f..f958cdcc9 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java @@ -25,9 +25,9 @@ import org.junit.jupiter.api.Test; */ public class SetOperationsTest { - protected final SimpleHasher from1 = new SimpleHasher(1, 1); + protected final Hasher from1 = new IncrementingHasher(1, 1); protected final long from1Value = 0x3FFFEL; - protected final SimpleHasher from11 = new SimpleHasher(11, 1); + protected final Hasher from11 = new IncrementingHasher(11, 1); protected final long from11Value = 0xFFFF800L; protected final HasherCollection bigHasher = new HasherCollection(from1, from11); protected final long bigHashValue = 0xFFFFFFEL; @@ -49,7 +49,7 @@ public class SetOperationsTest { Shape shape2 = Shape.fromKM(2, 72); filter1 = new SimpleBloomFilter(shape2, from1); - filter2 = new SimpleBloomFilter(shape2, new SimpleHasher(2, 1)); + filter2 = new SimpleBloomFilter(shape2, new IncrementingHasher(2, 1)); int dotProduct = /* [1,2] & [2,3] = [2] = */ 1; int cardinalityA = 2; diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleHasherTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleHasherTest.java deleted file mode 100644 index cb52bf80a..000000000 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleHasherTest.java +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * 
contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.commons.collections4.bloomfilter; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertThrows; -import java.util.ArrayList; -import java.util.List; -import org.junit.jupiter.api.Test; - -/** - * Tests the {@link SimpleHasher}. - */ -public class SimpleHasherTest extends AbstractHasherTest { - - @Override - protected Hasher createHasher() { - return new SimpleHasher(1, 1); - } - - @Override - protected Hasher createEmptyHasher() { - return NullHasher.INSTANCE; - } - - @Override - protected int getHasherSize(Hasher hasher) { - return 1; - } - - private void assertConstructorBuffer(Shape shape, byte[] buffer, Integer[] expected) { - SimpleHasher hasher = new SimpleHasher(buffer); - List