From 9a58c1bbdf7c65e71b33ab1ba406f1de5a8191df Mon Sep 17 00:00:00 2001 From: "Claude Warren, Jr" Date: Fri, 9 Sep 2022 22:18:32 +0100 Subject: [PATCH] Collections-763: Remove BloomFilter constructors that create initial entry --- .../bloomfilter/ArrayCountingBloomFilter.java | 33 ----- .../collections4/bloomfilter/BloomFilter.java | 41 +++++- .../bloomfilter/CountingBloomFilter.java | 137 ++++++++++++++++- .../bloomfilter/SimpleBloomFilter.java | 79 ++-------- .../bloomfilter/SparseBloomFilter.java | 82 ++--------- .../bloomfilter/package-info.java | 88 +++++------ .../bloomfilter/AbstractBloomFilterTest.java | 131 +++++++++++------ .../AbstractCountingBloomFilterTest.java | 68 ++++++--- .../bloomfilter/AbstractHasherTest.java | 8 +- .../ArrayCountingBloomFilterTest.java | 24 --- .../collections4/bloomfilter/ArrayHasher.java | 65 +++++++++ ...tMapProducerFromSimpleBloomFilterTest.java | 4 +- ...tMapProducerFromSparseBloomFilterTest.java | 4 +- .../DefaultBitMapProducerTest.java | 42 +++++- .../bloomfilter/DefaultBloomFilterTest.java | 86 +++-------- .../bloomfilter/DefaultIndexProducerTest.java | 113 ++++++++++++++ ...ndexProducerFromSimpleBloomFilterTest.java | 6 +- ...ndexProducerFromSparseBloomFilterTest.java | 7 +- .../bloomfilter/SetOperationsTest.java | 138 ++++++++++-------- .../bloomfilter/SimpleBloomFilterTest.java | 60 -------- .../bloomfilter/SparseBloomFilterTest.java | 63 +------- 21 files changed, 701 insertions(+), 578 deletions(-) create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/ArrayHasher.java create mode 100644 src/test/java/org/apache/commons/collections4/bloomfilter/DefaultIndexProducerTest.java diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java index ba13e6645..e76ddda44 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java @@ -125,39 +125,6 @@ public final class ArrayCountingBloomFilter implements CountingBloomFilter { return (int) IntStream.range(0, counts.length).filter(i -> counts[i] > 0).count(); } - @Override - public boolean merge(final BloomFilter other) { - Objects.requireNonNull(other, "other"); - try { - return add(BitCountProducer.from(other)); - } catch (IndexOutOfBoundsException e) { - throw new IllegalArgumentException( e ); - } - } - - @Override - public boolean merge(final Hasher hasher) { - Objects.requireNonNull(hasher, "hasher"); - try { - return add(BitCountProducer.from(hasher.uniqueIndices(shape))); - } catch (IndexOutOfBoundsException e) { - throw new IllegalArgumentException( - String.format("Filter only accepts values in the [0,%d) range", shape.getNumberOfBits())); - } - } - - @Override - public boolean remove(final BloomFilter other) { - Objects.requireNonNull(other, "other"); - return subtract(BitCountProducer.from(other)); - } - - @Override - public boolean remove(final Hasher hasher) { - Objects.requireNonNull(hasher, "hasher"); - return subtract(BitCountProducer.from(hasher.uniqueIndices(shape))); - } - @Override public boolean add(final BitCountProducer other) { Objects.requireNonNull(other, "other"); diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java index 9471e5dcd..f939538b0 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java @@ -135,7 +135,9 @@ public interface BloomFilter extends IndexProducer, BitMapProducer { * @param other The bloom filter to merge into this one. * @return true if the merge was successful */ - boolean merge(BloomFilter other); + default boolean merge(BloomFilter other) { + return (characteristics() & SPARSE) != 0 ? merge((IndexProducer) other ) : merge((BitMapProducer) other); + } /** * Merges the specified hasher into this Bloom filter. Specifically all @@ -143,7 +145,7 @@ public interface BloomFilter extends IndexProducer, BitMapProducer { * *

Note: This method should return {@code true} even if no additional bit indexes were * enabled. A {@code false} result indicates that this filter may or may not contain - * the {@code other} Bloom filter. This state may occur in complex Bloom filter implementations like + * the {@code hasher} values. This state may occur in complex Bloom filter implementations like * counting Bloom filters.

* * @param hasher The hasher to merge. @@ -151,12 +153,39 @@ public interface BloomFilter extends IndexProducer, BitMapProducer { */ default boolean merge(Hasher hasher) { Objects.requireNonNull(hasher, "hasher"); - Shape shape = getShape(); - // create the Bloom filter that is most likely to merge quickly with this one - BloomFilter result = (characteristics() & SPARSE) != 0 ? new SparseBloomFilter(shape, hasher) : new SimpleBloomFilter(shape, hasher); - return merge(result); + return merge(hasher.indices(getShape())); } + /** + * Merges the specified IndexProducer into this Bloom filter. Specifically all + * bit indexes that are identified by the {@code producer} will be enabled in this filter. + * + *

Note: This method should return {@code true} even if no additional bit indexes were + * enabled. A {@code false} result indicates that this filter may or may not contain all the indexes of + * the {@code producer}. This state may occur in complex Bloom filter implementations like + * counting Bloom filters.

+ * + * @param indexProducer The IndexProducer to merge. + * @return true if the merge was successful + * @throws IllegalArgumentException if producer sends illegal value. + */ + boolean merge(IndexProducer indexProducer); + + /** + * Merges the specified hasher into this Bloom filter. Specifically all + * bit indexes that are identified by the {@code producer} will be enabled in this filter. + * + *

Note: This method should return {@code true} even if no additional bit indexes were + * enabled. A {@code false} result indicates that this filter may or may not contain all the indexes + * enabled in the {@code producer}. This state may occur in complex Bloom filter implementations like + * counting Bloom filters.

+ * + * @param bitMapProducer The producer to merge. + * @return true if the merge was successful + * @throws IllegalArgumentException if producer sends illegal value. + */ + boolean merge(BitMapProducer bitMapProducer); + // Counting Operations /** diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java index 186c284e9..1d6f65cf5 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java @@ -16,6 +16,8 @@ */ package org.apache.commons.collections4.bloomfilter; +import java.util.Objects; + /** * The interface that describes a Bloom filter that associates a count with each * bit index to allow reversal of merge operations with remove operations. @@ -77,6 +79,84 @@ public interface CountingBloomFilter extends BloomFilter, BitCountProducer { // Modification Operations + /** + * Merges the specified Bloom filter into this Bloom filter. + * + *

Specifically: all counts for the indexes identified by the {@code other} filter will be incremented by 1,

+ * + *

Note: If the other filter is a counting Bloom filter the index counts are ignored and it is treated as an + * IndexProducer.

+ * + *

This method will return {@code true} if the filter is valid after the operation.

+ * + * @param other the other Bloom filter + * @return {@code true} if the removal was successful and the state is valid + * @see #isValid() + * @see #add(BitCountProducer) + */ + default boolean merge(final BloomFilter other) { + Objects.requireNonNull(other, "other"); + return merge((IndexProducer) other); + } + + /** + * Merges the specified Hasher into this Bloom filter. + * + *

Specifically: all counts for the unique indexes identified by the {@code hasher} will be incremented by 1,

+ * + *

This method will return {@code true} if the filter is valid after the operation.

+ * + * @param hasher the hasher + * @return {@code true} if the removal was successful and the state is valid + * @see #isValid() + * @see #add(BitCountProducer) + */ + default boolean merge(final Hasher hasher) { + Objects.requireNonNull(hasher, "hasher"); + return merge(hasher.uniqueIndices(getShape())); + } + + /** + * Merges the specified index producer into this Bloom filter. + * + *

Specifically: all counts for the indexes identified by the {@code indexProducer} will be incremented by 1,

+ * + *

This method will return {@code true} if the filter is valid after the operation.

+ * + *

Note: Indices that are returned multiple times will be incremented multiple times.

+ * + * @param indexProducer the IndexProducer + * @return {@code true} if the removal was successful and the state is valid + * @see #isValid() + * @see #add(BitCountProducer) + */ + default boolean merge(final IndexProducer indexProducer) { + Objects.requireNonNull(indexProducer, "indexProducer"); + try { + return add(BitCountProducer.from(indexProducer)); + } catch (IndexOutOfBoundsException e) { + throw new IllegalArgumentException( + String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits()), e); + } + } + + /** + * Merges the specified BitMap producer into this Bloom filter. + * + *

Specifically: all counts for the indexes identified by the {@code bitMapProducer} will be incremented by 1,

+ * + *

This method will return {@code true} if the filter is valid after the operation.

+ * + * @param bitMapProducer the BitMapProducer + * @return {@code true} if the removal was successful and the state is valid + * @see #isValid() + * @see #add(BitCountProducer) + */ + default boolean merge(final BitMapProducer bitMapProducer) { + Objects.requireNonNull(bitMapProducer, "bitMapProducer"); + return merge(IndexProducer.fromBitMapProducer(bitMapProducer)); + } + /** * Removes the specified Bloom filter from this Bloom filter. * @@ -92,12 +172,15 @@ public interface CountingBloomFilter extends BloomFilter, BitCountProducer { * @see #isValid() * @see #subtract(BitCountProducer) */ - boolean remove(BloomFilter other); + default boolean remove(final BloomFilter other) { + Objects.requireNonNull(other, "other"); + return remove((IndexProducer) other); + } /** - * Removes the specified hasher from the Bloom filter from this Bloom filter. + * Removes the unique values from the specified hasher from this Bloom filter. * - *

Specifically all counts for the indices produced by the {@code hasher} will be + *

Specifically all counts for the unique indices produced by the {@code hasher} will be * decremented by 1.

* *

For HasherCollections each enclosed Hasher will be considered a single item and decremented @@ -110,7 +193,53 @@ public interface CountingBloomFilter extends BloomFilter, BitCountProducer { * @see #isValid() * @see #subtract(BitCountProducer) */ - boolean remove(Hasher hasher); + default boolean remove(final Hasher hasher) { + Objects.requireNonNull(hasher, "hasher"); + return remove(hasher.uniqueIndices(getShape())); + } + + /** + * Removes the values from the specified IndexProducer from the Bloom filter from this Bloom filter. + * + *

Specifically all counts for the unique indices produced by the {@code hasher} will be + * decremented by 1.

+ * + *

This method will return {@code true} if the filter is valid after the operation.

+ * + *

Node: This method expects index producers that produce unique values.

+ * + * @param indexProducer the IndexProducer to provide the indexes + * @return {@code true} if the removal was successful and the state is valid + * @see #isValid() + * @see #subtract(BitCountProducer) + */ + default boolean remove(final IndexProducer indexProducer) { + Objects.requireNonNull(indexProducer, "indexProducer"); + try { + return subtract(BitCountProducer.from(indexProducer)); + } catch (IndexOutOfBoundsException e) { + throw new IllegalArgumentException( + String.format("Filter only accepts values in the [0,%d) range", getShape().getNumberOfBits())); + } + } + + /** + * Removes the specified BitMapProducer from this Bloom filter. + * + *

Specifically all counts for the indices produced by the {@code bitMapProducer} will be + * decremented by 1.

+ * + *

This method will return {@code true} if the filter is valid after the operation.

+ * + * @param bitMapProducer the BitMapProducer to provide the indexes + * @return {@code true} if the removal was successful and the state is valid + * @see #isValid() + * @see #subtract(BitCountProducer) + */ + default boolean remove(final BitMapProducer bitMapProducer) { + Objects.requireNonNull(bitMapProducer, "bitMapProducer"); + return remove(IndexProducer.fromBitMapProducer(bitMapProducer)); + } /** * Adds the specified BitCountProducer to this Bloom filter. diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java index eb777f726..df5ee707b 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java @@ -55,58 +55,6 @@ public final class SimpleBloomFilter implements BloomFilter { this.cardinality = 0; } - /** - * Creates an instance that is equivalent to {@code other}. - * - * @param other The bloom filter to copy. - */ - public SimpleBloomFilter(BloomFilter other) { - Objects.requireNonNull(other, "other"); - this.shape = other.getShape(); - this.bitMap = new long[BitMap.numberOfBitMaps(shape.getNumberOfBits())]; - this.cardinality = 0; - if ((other.characteristics() & SPARSE) != 0) { - merge((IndexProducer) other); - } else { - merge((BitMapProducer) other); - } - } - - /** - * Creates a populated instance. - * @param shape The shape for the filter. - * @param hasher the Hasher to initialize the filter with. - */ - public SimpleBloomFilter(final Shape shape, Hasher hasher) { - this(shape); - Objects.requireNonNull(hasher, "hasher"); - merge(hasher); - } - - /** - * Creates a populated instance. - * @param shape The shape for the filter. - * @param indices the IndexProducer to initialize the filter with. - * @throws IllegalArgumentException if producer sends illegal value. - */ - public SimpleBloomFilter(final Shape shape, IndexProducer indices) { - this(shape); - Objects.requireNonNull(indices, "indices"); - merge(indices); - } - - /** - * Creates a populated instance. - * @param shape The shape for the filter. - * @param bitMaps the BitMapProducer to initialize the filter with. - * @throws IllegalArgumentException if the producer returns too many or too few bit maps. - */ - public SimpleBloomFilter(final Shape shape, BitMapProducer bitMaps) { - this(shape); - Objects.requireNonNull(bitMaps, "bitMaps"); - merge(bitMaps); - } - /** * Copy constructor for {@code copy()} use. * @param source @@ -139,29 +87,24 @@ public final class SimpleBloomFilter implements BloomFilter { return new SimpleBloomFilter(this); } - /** - * Performs a merge using an IndexProducer. - * @param indexProducer the IndexProducer to merge from. - * @throws IllegalArgumentException if producer sends illegal value. - */ - private void merge(IndexProducer indexProducer) { + @Override + public boolean merge(IndexProducer indexProducer) { + Objects.requireNonNull(indexProducer, "indexProducer"); indexProducer.forEachIndex(idx -> { if (idx < 0 || idx >= shape.getNumberOfBits()) { throw new IllegalArgumentException(String.format( - "IndexProducer should only send values in the range[0,%s]", shape.getNumberOfBits() - 1)); + "IndexProducer should only send values in the range[0,%s)", shape.getNumberOfBits())); } BitMap.set(bitMap, idx); return true; }); cardinality = -1; + return true; } - /** - * Performs a merge using an BitMapProducer. - * @param bitMapProducer the BitMapProducer to merge from. - * @throws IllegalArgumentException if producer sends illegal value. - */ - private void merge(BitMapProducer bitMapProducer) { + @Override + public boolean merge(BitMapProducer bitMapProducer) { + Objects.requireNonNull(bitMapProducer, "bitMapProducer"); try { int[] idx = new int[1]; bitMapProducer.forEachBitMap(value -> { @@ -173,7 +116,7 @@ public final class SimpleBloomFilter implements BloomFilter { int idxLimit = BitMap.getLongIndex(shape.getNumberOfBits()); if (idxLimit < idx[0]) { throw new IllegalArgumentException(String.format( - "BitMapProducer set a bit higher than the limit for the shape: %s", shape.getNumberOfBits())); + "BitMapProducer set a bit higher than the limit for the shape: %s", shape.getNumberOfBits() - 1)); } if (idxLimit == idx[0]) { long excess = (bitMap[idxLimit] >> shape.getNumberOfBits()); @@ -188,13 +131,13 @@ public final class SimpleBloomFilter implements BloomFilter { throw new IllegalArgumentException( String.format("BitMapProducer should send at most %s maps", bitMap.length), e); } + return true; } @Override public boolean merge(Hasher hasher) { Objects.requireNonNull(hasher, "hasher"); - merge(hasher.indices(shape)); - return true; + return merge(hasher.indices(shape)); } @Override diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java index 9359e6a39..fdea9b469 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java @@ -49,68 +49,6 @@ public final class SparseBloomFilter implements BloomFilter { this.indices = new TreeSet<>(); } - /** - * Creates an instance that is equivalent to {@code other}. - * - * @param other The bloom filter to copy. - */ - public SparseBloomFilter(BloomFilter other) { - Objects.requireNonNull(other, "other"); - this.shape = other.getShape(); - this.indices = new TreeSet<>(); - if ((other.characteristics() & SPARSE) != 0) { - merge((IndexProducer) other); - } else { - merge(IndexProducer.fromBitMapProducer(other)); - } - } - - private void checkIndices(Shape shape) { - if (this.indices.floor(-1) != null || this.indices.ceiling(shape.getNumberOfBits()) != null) { - throw new IllegalArgumentException( - String.format("Filter only accepts values in the [0,%d) range", shape.getNumberOfBits())); - } - } - - /** - * Constructs a populated Bloom filter. - * @param shape the shape for the bloom filter. - * @param hasher the hasher to provide the initial data. - */ - public SparseBloomFilter(final Shape shape, Hasher hasher) { - this(shape); - Objects.requireNonNull(hasher, "hasher"); - hasher.indices(shape).forEachIndex(this::add); - checkIndices(shape); - } - - /** - * Constructs a populated Bloom filter. - * @param shape the shape of the filter. - * @param indices an index producer for the indices to to enable. - * @throws IllegalArgumentException if indices contains a value greater than the number - * of bits in the shape. - */ - public SparseBloomFilter(Shape shape, IndexProducer indices) { - this(shape); - Objects.requireNonNull(indices, "indices"); - indices.forEachIndex(this::add); - checkIndices(shape); - } - - /** - * Constructs a populated Bloom filter. - * @param shape the shape of the filter. - * @param bitMaps a BitMapProducer for the bit maps to add. - * @throws IllegalArgumentException if the bit maps contain a value greater than the number - * of bits in the shape. - */ - public SparseBloomFilter(Shape shape, BitMapProducer bitMaps) { - this(shape); - Objects.requireNonNull(bitMaps, "bitMaps"); - merge(IndexProducer.fromBitMapProducer(bitMaps)); - } - private SparseBloomFilter(SparseBloomFilter source) { shape = source.shape; indices = new TreeSet<>(source.indices); @@ -140,23 +78,27 @@ public final class SparseBloomFilter implements BloomFilter { return true; } - /** - * Performs a merge using an IndexProducer. - * @param indexProducer the IndexProducer to merge from. - * @throws IllegalArgumentException if producer sends illegal value. - */ - private void merge(IndexProducer indexProducer) { + @Override + public boolean merge(IndexProducer indexProducer) { + Objects.requireNonNull(indexProducer, "indexProducer"); indexProducer.forEachIndex(this::add); if (!this.indices.isEmpty()) { if (this.indices.last() >= shape.getNumberOfBits()) { throw new IllegalArgumentException(String.format("Value in list %s is greater than maximum value (%s)", - this.indices.last(), shape.getNumberOfBits())); + this.indices.last(), shape.getNumberOfBits() - 1)); } if (this.indices.first() < 0) { throw new IllegalArgumentException( String.format("Value in list %s is less than 0", this.indices.first())); } } + return true; + } + + @Override + public boolean merge(BitMapProducer bitMapProducer) { + Objects.requireNonNull(bitMapProducer, "bitMapProducer"); + return this.merge(IndexProducer.fromBitMapProducer(bitMapProducer)); } @Override @@ -213,7 +155,7 @@ public final class SparseBloomFilter implements BloomFilter { * because our indices are always in order we can shorten the time necessary to * create the longs for the consumer */ - // the currenlty constructed bitMap + // the currently constructed bitMap long bitMap = 0; // the bitmap we are working on int idx = 0; diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java b/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java index c20725456..a4f5ce4f2 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java @@ -20,58 +20,57 @@ * *

Background:

* - *

The Bloom filter is a probabilistic data structure that indicates where things are not. - * Conceptually it is a bit vector. You create a Bloom filter by creating hashes - * and converting those to enabled bits in the vector. Multiple Bloom filters may be merged - * together into one Bloom filter. It is possible to test if a filter {@code B} has merged into - * another filter {@code A} by verifying that {@code (A & B) == B}.

+ *

The Bloom filter is a probabilistic data structure that indicates where things are not. Conceptually it is a bit + * vector. You create a Bloom filter by creating hashes and converting those to enabled bits in the vector. Multiple + * Bloom filters may be merged together into one Bloom filter. It is possible to test if a filter {@code B} has merged + * into another filter {@code A} by verifying that {@code (A & B) == B}.

* - *

Bloom filters are generally used where hash - * tables would be too large, or as a filter front end for longer processes. For example - * most browsers have a Bloom filter that is built from all known bad URLs (ones that - * serve up malware). When you enter a URL the browser builds a Bloom filter and checks to - * see if it is "in" the bad URL filter. If not the URL is good, if it matches, then the - * expensive lookup on a remote system is made to see if it actually is in the list. There - * are lots of other uses, and in most cases the reason is to perform a fast check as a - * gateway for a longer operation.

+ *

Bloom filters are generally used where hash tables would be too large, or as a filter front end for longer processes. + * For example most browsers have a Bloom filter that is built from all known bad URLs (ones that serve up malware). + * When you enter a URL the browser builds a Bloom filter and checks to see if it is "in" the bad URL filter. If not the + * URL is good, if it matches, then the expensive lookup on a remote system is made to see if it actually is in the + * list. There are lots of other uses, and in most cases the reason is to perform a fast check as a gateway for a longer + * operation.

* *

BloomFilter

* - *

The Bloom filter architecture here is designed so that the implementation of the storage of bits is abstracted. + *

The Bloom filter architecture here is designed for speed of execution, so some methods like {@code merge}, {@code remove}, + * {@code add}, and {@code subtract} may throw exceptions. Once an exception is thrown the state of the Bloom filter is unknown. + * The choice to use not use atomic transactions was made to achieve maximum performance under correct usage.

+ * + *

In addition the architecture is designed so that the implementation of the storage of bits is abstracted. * Programs that utilize the Bloom filters may use the {@code BitMapProducer} or {@code IndexProducer} to retrieve a - * representation of the internal structure. Additional methods are available in the {@code BitMap} to assist in + * representation of the internal structure. Additional methods are available in the {@code BitMap} to assist in * manipulation of the representations.

* - *

The bloom filter code is an interface that requires implementation of 6 methods:

+ *

The bloom filter code is an interface that requires implementation of 9 methods:

* * - *

Other methods should be implemented where they can be done so more efficiently than the default implementations. - *

+ *

Other methods should be implemented where they can be done so more efficiently than the default implementations.

* *

CountingBloomFilter

* *

The counting bloom filter extends the Bloom filter by counting the number of times a specific bit has been - * enabled or disabled. This allows the removal (opposite of merge) of Bloom filters at the expense of additional + * enabled or disabled. This allows the removal (opposite of merge) of Bloom filters at the expense of additional * overhead.

* *

Shape

@@ -80,22 +79,23 @@ * *

Hasher

* - *

A Hasher converts bytes into a series of integers based on a Shape. With the exception of the HasherCollecton, - * each hasher represents one item being added to the Bloom filter. The HasherCollection represents the - * number of items as the sum of the number of items represented by the Hashers in the collection.

+ *

A Hasher converts bytes into a series of integers based on a Shape. With the exception of the HasherCollecton, + * each hasher represents one item being added to the Bloom filter. The HasherCollection represents the number of + * items as the sum of the number of items represented by the Hashers in the collection.

* - *

The SimpleHasher uses a combinatorial generation technique to create the integers. It is easily - * initialized by using a standard {@code MessageDigest} or other Hash function to hash the item to insert and - * then splitting the hash bytes in half and considering each as a long value.

+ *

The EnhancedDoubleHasher uses a combinatorial generation technique to create the integers. It is easily + * initialized by using a byte array returned by the standard {@code MessageDigest} or other hash function to + * initialize the Hasher. Alternatively a pair of a long values may also be used.

* - *

Other implementations of the Hasher are easy to implement, and should make use of the {@code Hasher.Filter} - * and/or {@code Hasher.FileredIntConsumer} classes to filter out duplicate indices.

+ *

Other implementations of the Hasher are easy to implement, and should make use of the {@code Hasher.Filter} + * and/or {@code Hasher.FileredIntConsumer} classes to filter out duplicate indices when implementing + * {@code Hasher.uniqueIndices(Shape)}.

* *

References

* *
    - *
  1. https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
  2. - *
  3. https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/utils/BloomFilter.java#L60
  4. + *
  5. https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
  6. + *
  7. https://github.com/apache/cassandra/blob/trunk/src/java/org/apache/cassandra/utils/BloomFilter.java#L60
  8. *
* * @since 4.5 diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java index da2aa897c..91eb763af 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractBloomFilterTest.java @@ -16,6 +16,7 @@ */ package org.apache.commons.collections4.bloomfilter; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotEquals; @@ -24,6 +25,8 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import java.util.ArrayList; import java.util.List; +import java.util.SortedSet; + import org.junit.jupiter.api.Test; /** @@ -70,7 +73,11 @@ public abstract class AbstractBloomFilterTest { * @param hasher the hasher to use to create the filter. * @return a BloomFilter implementation. */ - protected abstract T createFilter(Shape shape, Hasher hasher); + protected final T createFilter(Shape shape, Hasher hasher) { + T bf = createEmptyFilter(shape); + bf.merge(hasher); + return bf; + } /** * Create the BloomFilter implementation we are testing. @@ -79,7 +86,11 @@ public abstract class AbstractBloomFilterTest { * @param producer A BitMap producer to build the filter with. * @return a BloomFilter implementation. */ - protected abstract T createFilter(Shape shape, BitMapProducer producer); + protected final T createFilter(Shape shape, BitMapProducer producer) { + T bf = createEmptyFilter(shape); + bf.merge(producer); + return bf; + } /** * Create the BloomFilter implementation we are testing. @@ -88,57 +99,85 @@ public abstract class AbstractBloomFilterTest { * @param producer An Index producer to build the filter with. * @return a BloomFilter implementation. */ - protected abstract T createFilter(Shape shape, IndexProducer producer); + protected final T createFilter(Shape shape, IndexProducer producer) { + T bf = createEmptyFilter(shape); + bf.merge(producer); + return bf; + } /** * */ @Test - public void testConstructWithBadHasher() { + public void testMergeWithBadHasher() { // value too large + final BloomFilter f = createEmptyFilter(getTestShape()); assertThrows(IllegalArgumentException.class, - () -> createFilter(getTestShape(), new BadHasher(getTestShape().getNumberOfBits()))); + () -> f.merge(new BadHasher(getTestShape().getNumberOfBits()))); // negative value - assertThrows(IllegalArgumentException.class, () -> createFilter(getTestShape(), new BadHasher(-1))); + BloomFilter f2 = createEmptyFilter(getTestShape()); + assertThrows(IllegalArgumentException.class, () -> f2.merge(new BadHasher(-1))); } @Test - public void testConstructWitBitMapProducer() { - long[] values = { from11Value, 0x9L }; - BloomFilter f = createFilter(getTestShape(), BitMapProducer.fromBitMapArray(values)); - List lst = new ArrayList<>(); - for (long l : values) { - lst.add(l); + public void testMergeWithHasher() { + for (int i=0; i<5000; i++) { + final BloomFilter f = createEmptyFilter(getTestShape()); + int[] expected = DefaultIndexProducerTest.generateIntArray(getTestShape().getNumberOfHashFunctions(), getTestShape().getNumberOfBits()); + Hasher hasher = new ArrayHasher(expected); + f.merge(hasher); + // create sorted unique array of expected values + assertArrayEquals(DefaultIndexProducerTest.unique(expected), f.asIndexArray( )); } - assertTrue(f.forEachBitMap(l -> { - return lst.remove(Long.valueOf(l)); - })); - assertTrue(lst.isEmpty()); + } - BitMapProducer badProducer = BitMapProducer.fromBitMapArray(0L, Long.MAX_VALUE); + @Test + public void testMergeWithBitMapProducer() { + for (int i=0; i<5000; i++) { + long[] values = new long[2]; + for (int idx : DefaultIndexProducerTest.generateIntArray(getTestShape().getNumberOfHashFunctions(), getTestShape().getNumberOfBits())) { + BitMap.set(values, idx); + } + BloomFilter f = createFilter(getTestShape(), BitMapProducer.fromBitMapArray(values)); + List lst = new ArrayList<>(); + for (long l : values) { + lst.add(l); + } + assertTrue(f.forEachBitMap(l -> { + return lst.remove(Long.valueOf(l)); + })); + assertTrue(lst.isEmpty()); + } // values too large - assertThrows(IllegalArgumentException.class, () -> createFilter(getTestShape(), badProducer)); + final BitMapProducer badProducer = BitMapProducer.fromBitMapArray(0L, Long.MAX_VALUE); + final BloomFilter bf = createEmptyFilter(getTestShape()); + assertThrows(IllegalArgumentException.class, () -> bf.merge(badProducer)); + + // test where merged bits exceed expected bits but both bitmaps are the same length. + final BitMapProducer badProducer2 = BitMapProducer.fromBitMapArray(0x80_00_00_00_00_00_00_00L); + final BloomFilter bf2 = createEmptyFilter(Shape.fromKM(3, 32)); + assertThrows(IllegalArgumentException.class, () -> bf2.merge(badProducer2)); } @Test - public void testConstructWithIndexProducer() { - int[] values = new int[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 }; - BloomFilter f = createFilter(getTestShape(), IndexProducer.fromIndexArray(values)); - List lst = new ArrayList<>(); - for (int i : values) { - lst.add(i); + public void testMergeWithIndexProducer() { + for (int i=0; i<5000; i++) { + int[] values = DefaultIndexProducerTest.generateIntArray(getTestShape().getNumberOfHashFunctions(), getTestShape().getNumberOfBits()); + BloomFilter f = createFilter(getTestShape(), IndexProducer.fromIndexArray(values)); + SortedSet uniqueValues = DefaultIndexProducerTest.uniqueSet(values); + assertTrue(f.forEachIndex(idx -> { + return uniqueValues.remove(Integer.valueOf(idx)); + })); + assertTrue(uniqueValues.isEmpty()); } - assertTrue(f.forEachIndex(i -> { - return lst.remove(Integer.valueOf(i)); - })); - assertTrue(lst.isEmpty()); - // value to large - assertThrows(IllegalArgumentException.class, () -> createFilter(getTestShape(), - IndexProducer.fromIndexArray(new int[] { getTestShape().getNumberOfBits() }))); - // negative value + final BloomFilter f1 = createEmptyFilter(getTestShape()); assertThrows(IllegalArgumentException.class, - () -> createFilter(getTestShape(), IndexProducer.fromIndexArray(new int[] { -1 }))); + () -> f1.merge(IndexProducer.fromIndexArray(new int[] { getTestShape().getNumberOfBits() }))); + // negative value + final BloomFilter f2 = createEmptyFilter(getTestShape()); + assertThrows(IllegalArgumentException.class, + () -> f2.merge(IndexProducer.fromIndexArray(new int[] { -1 }))); } @Test @@ -228,7 +267,7 @@ public abstract class AbstractBloomFilterTest { @Test public final void testEstimateN() { // build a filter - BloomFilter filter1 = new SimpleBloomFilter(getTestShape(), from1); + BloomFilter filter1 = createFilter(getTestShape(), from1); assertEquals(1, filter1.estimateN()); // the data provided above do not generate an estimate that is equivalent to the @@ -316,20 +355,20 @@ public abstract class AbstractBloomFilterTest { assertThrows(IllegalArgumentException.class, () -> bf1.merge(new BadHasher(-1))); // test error when bloom filter returns values out of range - final BloomFilter bf5 = new SimpleBloomFilter( - Shape.fromKM(getTestShape().getNumberOfHashFunctions(), 3 * Long.SIZE), - new IncrementingHasher(Long.SIZE * 2, 1)); + BloomFilter bf5 = new SimpleBloomFilter( + Shape.fromKM(getTestShape().getNumberOfHashFunctions(), 3 * Long.SIZE)); + bf5.merge(new IncrementingHasher(Long.SIZE * 2, 1)); assertThrows(IllegalArgumentException.class, () -> bf1.merge(bf5)); - final BloomFilter bf6 = new SparseBloomFilter( - Shape.fromKM(getTestShape().getNumberOfHashFunctions(), 3 * Long.SIZE), - new IncrementingHasher(Long.SIZE * 2, 1)); + BloomFilter bf6 = new SparseBloomFilter( + Shape.fromKM(getTestShape().getNumberOfHashFunctions(), 3 * Long.SIZE)); + bf6.merge(new IncrementingHasher(Long.SIZE * 2, 1)); assertThrows(IllegalArgumentException.class, () -> bf1.merge(bf6)); } - private static void assertIndexProducerConstructor(Shape shape, int[] values, int[] expected) { + private void assertIndexProducerMerge(Shape shape, int[] values, int[] expected) { IndexProducer indices = IndexProducer.fromIndexArray(values); - SparseBloomFilter filter = new SparseBloomFilter(shape, indices); + BloomFilter filter = createFilter(shape, indices); List lst = new ArrayList<>(); filter.forEachIndex(x -> { lst.add(x); @@ -347,18 +386,18 @@ public abstract class AbstractBloomFilterTest { } @Test - public void testIndexProducerConstructor() { + public void testIndexProducerMerge() { Shape shape = Shape.fromKM(5, 10); - assertIndexProducerConstructor(shape, new int[] { 0, 2, 4, 6, 8 }, new int[] { 0, 2, 4, 6, 8 }); + assertIndexProducerMerge(shape, new int[] { 0, 2, 4, 6, 8 }, new int[] { 0, 2, 4, 6, 8 }); // test duplicate values - assertIndexProducerConstructor(shape, new int[] { 0, 2, 4, 2, 8 }, new int[] { 0, 2, 4, 8 }); + assertIndexProducerMerge(shape, new int[] { 0, 2, 4, 2, 8 }, new int[] { 0, 2, 4, 8 }); // test negative values assertFailedIndexProducerConstructor(shape, new int[] { 0, 2, 4, -2, 8 }); // test index too large assertFailedIndexProducerConstructor(shape, new int[] { 0, 2, 4, 12, 8 }); // test no indices - assertIndexProducerConstructor(shape, new int[0], new int[0]); + assertIndexProducerMerge(shape, new int[0], new int[0]); } @Test diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java index b7ca7dd37..48fbd92f4 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractCountingBloomFilterTest.java @@ -17,6 +17,7 @@ package org.apache.commons.collections4.bloomfilter; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -94,7 +95,8 @@ public abstract class AbstractCountingBloomFilterTest bf6.remove(ip2)); + + final CountingBloomFilter bf7 = createFilter(getTestShape(), from1); + final BitMapProducer bmp2 = BitMapProducer.fromIndexProducer(ip2, getTestShape().getNumberOfBits()); + assertThrows( IllegalArgumentException.class, () -> bf7.remove(bmp2)); } @Test @@ -247,20 +283,12 @@ public abstract class AbstractCountingBloomFilterTest { - assertEquals(1, y, "Hasher in mergeInPlace results in value not equal to 1"); - return true; - }); - - bf1 = createEmptyFilter(shape); - CountingBloomFilter bf2 = bf1.copy(); - bf2.merge(hasher); - assertEquals(6, bf2.cardinality()); - bf2.forEachCount((x, y) -> { assertEquals(1, y, "Hasher in merge results in value not equal to 1"); return true; }); - bf1 = createFilter(shape, hasher); + bf1 = createEmptyFilter(shape); + bf1.merge(hasher); bf1.remove(hasher); assertEquals(0, bf1.cardinality()); assertTrue(bf1.forEachCount((x, y) -> (false)), "Hasher in removes results in value not equal to 0"); diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractHasherTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractHasherTest.java index cd5eda18c..0e9fae410 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractHasherTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/AbstractHasherTest.java @@ -85,7 +85,7 @@ public abstract class AbstractHasherTest extends AbstractIndexProducerTest { count[0]++; return false; }); - assertEquals(1, count[0], "did not exit early" ); + assertEquals(1, count[0], "did not exit early"); } @Test @@ -97,8 +97,8 @@ public abstract class AbstractHasherTest extends AbstractIndexProducerTest { List full = Arrays.stream(producer.asIndexArray()).boxed().collect(Collectors.toList()); producer = hasher.uniqueIndices(shape); List unique = Arrays.stream(producer.asIndexArray()).boxed().collect(Collectors.toList()); - assertTrue( full.size() > unique.size() ); - Set set = new HashSet( unique ); - assertEquals( set.size(), unique.size() ); + assertTrue(full.size() > unique.size()); + Set set = new HashSet<>(unique); + assertEquals(set.size(), unique.size()); } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java index 86bd638b7..e7141dade 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilterTest.java @@ -25,28 +25,4 @@ public class ArrayCountingBloomFilterTest extends AbstractCountingBloomFilterTes protected ArrayCountingBloomFilter createEmptyFilter(Shape shape) { return new ArrayCountingBloomFilter(shape); } - - @Override - protected ArrayCountingBloomFilter createFilter(Shape shape, Hasher hasher) { - return createFilter(shape, hasher.uniqueIndices(shape)); - } - - @Override - protected ArrayCountingBloomFilter createFilter(Shape shape, BitMapProducer producer) { - return createFilter(shape, IndexProducer.fromBitMapProducer(producer)); - } - - @Override - protected ArrayCountingBloomFilter createFilter(Shape shape, IndexProducer producer) { - ArrayCountingBloomFilter filter = createEmptyFilter(shape); - try { - filter.add(BitCountProducer.from(producer)); - return filter; - } catch (ArrayIndexOutOfBoundsException e) { - // since ArrayCountingBloomFilter does not ahave a constructor that takes a - // hasher - // we have to duplicate the expected results here. - throw new IllegalArgumentException(e); - } - } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayHasher.java b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayHasher.java new file mode 100644 index 000000000..babd13d2c --- /dev/null +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/ArrayHasher.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.Objects; +import java.util.function.IntPredicate; + +/** + * A Testing Hasher that returns the array values % shape.getNumberOfBits(). + * + * @since 4.5 + */ +public final class ArrayHasher implements Hasher { + final int[] values; + + ArrayHasher(final int... values) { + this.values = values; + } + + @Override + public IndexProducer indices(final Shape shape) { + Objects.requireNonNull(shape, "shape"); + return new Producer(shape); + } + + @Override + public IndexProducer uniqueIndices(Shape shape) { + return new Producer(shape); + } + + private class Producer implements IndexProducer { + Shape shape; + + Producer(Shape shape) { + this.shape = shape; + } + + @Override + public boolean forEachIndex(IntPredicate consumer) { + int pos = 0; + for (int i=0; i(); } - AbstractDefaultBloomFilter(Shape shape, Hasher hasher) { - this(shape, hasher.indices(shape)); - } - - AbstractDefaultBloomFilter(Shape shape, BitMapProducer producer) { - this(shape, IndexProducer.fromBitMapProducer(producer)); - } - - AbstractDefaultBloomFilter(Shape shape, IndexProducer producer) { - this(shape); - producer.forEachIndex((i) -> { - indices.add(i); - return true; - }); - if (this.indices.floor(-1) != null || this.indices.ceiling(shape.getNumberOfBits()) != null) { - throw new IllegalArgumentException( - String.format("Filter only accepts values in the [0,%d) range", shape.getNumberOfBits())); - } - } - @Override public void clear() { indices.clear(); @@ -147,12 +112,7 @@ public class DefaultBloomFilterTest extends AbstractBloomFilterTest { - indices.add(i); - return true; - }); + private void checkIndicesRange() { if (!indices.isEmpty()) { if (indices.last() >= shape.getNumberOfBits()) { throw new IllegalArgumentException(String.format("Value in list %s is greater than maximum value (%s)", @@ -163,7 +123,21 @@ public class DefaultBloomFilterTest extends AbstractBloomFilterTest { + indices.add(x); + return true; + }); + checkIndicesRange(); + return result; + } + + @Override + public boolean merge(BitMapProducer bitMapProducer){ + return merge(IndexProducer.fromBitMapProducer(bitMapProducer)); } @Override @@ -174,18 +148,6 @@ public class DefaultBloomFilterTest extends AbstractBloomFilterTest uniqueSet(int[] ary) { + SortedSet uniq = new TreeSet(); + for (int idx : ary) { + uniq.add(idx); + } + return uniq; + } + + /** + * Creates a sorted unique array of ints. + * @param ary the array to sort and make unique + * @return the sorted unique array. + */ + public static int[] unique(int[] ary) { + Set uniq = uniqueSet(ary); + int[] result = new int[uniq.size()]; + int i=0; + for (int idx : uniq) { + result[i++] = idx; + } + return result; + } + + @Test + public void testFromBitMapProducer() { + for (int i=0; i<5000; i++) { + int[] expected = generateIntArray( 7, 256 ); + long[] bits = new long[BitMap.numberOfBitMaps(256)]; + for (int bitIndex : expected) { + BitMap.set(bits, bitIndex); + } + IndexProducer ip = IndexProducer.fromBitMapProducer(BitMapProducer.fromBitMapArray(bits)); + assertArrayEquals(unique(expected), ip.asIndexArray()); + } + } + + @Test + public void testFromIndexArray() { + for (int i=0; i<5000; i++) { + int[] expected = generateIntArray(10, 256); + IndexProducer ip = IndexProducer.fromIndexArray(expected); + assertArrayEquals(unique(expected), ip.asIndexArray()); + } + } +} diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSimpleBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSimpleBloomFilterTest.java index d62ac959e..40fded6dd 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSimpleBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSimpleBloomFilterTest.java @@ -23,11 +23,13 @@ public class IndexProducerFromSimpleBloomFilterTest extends AbstractIndexProduce @Override protected IndexProducer createProducer() { Hasher hasher = new IncrementingHasher(0, 1); - return new SparseBloomFilter(shape, hasher); + BloomFilter bf = new SimpleBloomFilter(shape); + bf.merge(hasher); + return bf; } @Override protected IndexProducer createEmptyProducer() { - return new SparseBloomFilter(shape); + return new SimpleBloomFilter(shape); } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSparseBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSparseBloomFilterTest.java index b51e01b40..bd1e34836 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSparseBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/IndexProducerFromSparseBloomFilterTest.java @@ -23,11 +23,14 @@ public class IndexProducerFromSparseBloomFilterTest extends AbstractIndexProduce @Override protected IndexProducer createProducer() { Hasher hasher = new IncrementingHasher(0, 1); - return new SimpleBloomFilter(shape, hasher); + BloomFilter bf = new SparseBloomFilter(shape); + bf.merge(hasher); + return bf; + } @Override protected IndexProducer createEmptyProducer() { - return new SimpleBloomFilter(shape); + return new SparseBloomFilter(shape); } } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java index 890e22f8f..98a62604f 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java @@ -49,22 +49,34 @@ public class SetOperationsTest { assertEquals(expected, operation.applyAsDouble(filter2, filter1), "op(filter2, filter1)"); } + private BloomFilter createFilter(Shape shape, Hasher hasher) { + BloomFilter bf = new SimpleBloomFilter(shape); + bf.merge(hasher); + return bf; + } + + private BloomFilter createFilter(Shape shape, IndexProducer producer) { + BloomFilter bf = new SparseBloomFilter(shape); + bf.merge(producer); + return bf; + } + /** * Tests that the Cosine similarity is correctly calculated. */ @Test public final void testCosineDistance() { - BloomFilter filter1 = new SimpleBloomFilter(shape, from1); - BloomFilter filter2 = new SimpleBloomFilter(shape, from1); + BloomFilter filter1 = createFilter(shape, from1); + BloomFilter filter2 = createFilter(shape, from1); // identical filters should have no distance. double expected = 0; assertSymmetricOperation(expected, SetOperations::cosineDistance, filter1, filter2); Shape shape2 = Shape.fromKM(2, 72); - filter1 = new SimpleBloomFilter(shape2, from1); - filter2 = new SimpleBloomFilter(shape2, new IncrementingHasher(2, 1)); + filter1 = createFilter(shape2, from1); + filter2 = createFilter(shape2, new IncrementingHasher(2, 1)); int dotProduct = /* [1,2] & [2,3] = [2] = */ 1; int cardinalityA = 2; @@ -72,8 +84,8 @@ public class SetOperationsTest { expected = 1 - (dotProduct / Math.sqrt(cardinalityA * cardinalityB)); assertSymmetricOperation(expected, SetOperations::cosineDistance, filter1, filter2); - filter1 = new SimpleBloomFilter(shape, from1); - filter2 = new SimpleBloomFilter(shape, from11); + filter1 = createFilter(shape, from1); + filter2 = createFilter(shape, from11); dotProduct = /* [1..17] & [11..27] = [] = */ 7; cardinalityA = 17; cardinalityB = 17; @@ -81,20 +93,19 @@ public class SetOperationsTest { assertSymmetricOperation(expected, SetOperations::cosineDistance, filter1, filter2); // test with no values - filter1 = new SimpleBloomFilter(shape, from1); + filter1 = createFilter(shape, from1); filter2 = new SimpleBloomFilter(shape); - BloomFilter filter3 = new SimpleBloomFilter(shape); dotProduct = /* [1,2] & [] = [] = */ 0; cardinalityA = 2; cardinalityB = 0; - expected = /* 1 - (dotProduct/Math.sqrt( cardinalityA * cardinalityB )) = */ 1.0; + expected = /* 1 - (dotProduct/Math.sqrt(cardinalityA * cardinalityB)) = */ 1.0; assertSymmetricOperation(expected, SetOperations::cosineDistance, filter1, filter2); dotProduct = /* [] & [] = [] = */ 0; cardinalityA = 0; cardinalityB = 0; - expected = /* 1 - (dotProduct/Math.sqrt( cardinalityA * cardinalityB )) = */ 1.0; + expected = /* 1 - (dotProduct/Math.sqrt(cardinalityA * cardinalityB)) = */ 1.0; assertSymmetricOperation(expected, SetOperations::cosineDistance, filter1, filter2); } @@ -103,27 +114,27 @@ public class SetOperationsTest { */ @Test public final void testCosineSimilarity() { - BloomFilter filter1 = new SimpleBloomFilter(shape, from1); - BloomFilter filter2 = new SimpleBloomFilter(shape, from1); + BloomFilter filter1 = createFilter(shape, from1); + BloomFilter filter2 = createFilter(shape, from1); int dotProduct = /* [1..17] & [1..17] = [1..17] = */ 17; int cardinalityA = 17; int cardinalityB = 17; - double expected = /* dotProduct/Sqrt( cardinalityA * cardinalityB ) = */ 1.0; + double expected = /* dotProduct/Sqrt(cardinalityA * cardinalityB) = */ 1.0; assertSymmetricOperation(expected, SetOperations::cosineSimilarity, filter1, filter2); dotProduct = /* [1..17] & [11..27] = [11..17] = */ 7; cardinalityA = 17; cardinalityB = 17; expected = dotProduct / Math.sqrt(cardinalityA * cardinalityB); - filter2 = new SimpleBloomFilter(shape, from11); + filter2 = createFilter(shape, from11); assertSymmetricOperation(expected, SetOperations::cosineSimilarity, filter1, filter2); // test no values filter1 = new SimpleBloomFilter(shape); filter2 = new SimpleBloomFilter(shape); // build a filter - BloomFilter filter3 = new SimpleBloomFilter(shape, from1); + BloomFilter filter3 = createFilter(shape, from1); assertSymmetricOperation(0.0, SetOperations::cosineSimilarity, filter1, filter2); assertSymmetricOperation(0.0, SetOperations::cosineSimilarity, filter1, filter3); } @@ -133,13 +144,13 @@ public class SetOperationsTest { */ @Test public final void testHammingDistance() { - final BloomFilter filter1 = new SimpleBloomFilter(shape, from1); - BloomFilter filter2 = new SimpleBloomFilter(shape, from1); + final BloomFilter filter1 = createFilter(shape, from1); + BloomFilter filter2 = createFilter(shape, from1); int hammingDistance = /* [1..17] ^ [1..17] = [] = */ 0; assertSymmetricOperation(hammingDistance, SetOperations::hammingDistance, filter1, filter2); - filter2 = new SimpleBloomFilter(shape, from11); + filter2 = createFilter(shape, from11); hammingDistance = /* [1..17] ^ [11..27] = [1..10][17-27] = */ 20; assertSymmetricOperation(hammingDistance, SetOperations::hammingDistance, filter1, filter2); } @@ -149,13 +160,13 @@ public class SetOperationsTest { */ @Test public final void testJaccardDistance() { - BloomFilter filter1 = new SimpleBloomFilter(shape, from1); - BloomFilter filter2 = new SimpleBloomFilter(shape, from1); + BloomFilter filter1 = createFilter(shape, from1); + BloomFilter filter2 = createFilter(shape, from1); // 1 - jaccardSimilarity -- see jaccardSimilarityTest assertSymmetricOperation(0.0, SetOperations::jaccardDistance, filter1, filter2); - filter2 = new SimpleBloomFilter(shape, from11); + filter2 = createFilter(shape, from11); double intersection = /* [1..17] & [11..27] = [11..17] = */ 7.0; int union = /* [1..17] | [11..27] = [1..27] = */ 27; double expected = 1 - (intersection / union); @@ -164,7 +175,7 @@ public class SetOperationsTest { // test no values filter1 = new SimpleBloomFilter(shape); filter2 = new SimpleBloomFilter(shape); - BloomFilter filter3 = new SimpleBloomFilter(shape, from1); + BloomFilter filter3 = createFilter(shape, from1); // 1 - jaccardSimilarity -- see jaccardSimilarityTest assertSymmetricOperation(1.0, SetOperations::jaccardDistance, filter1, filter2); @@ -176,15 +187,15 @@ public class SetOperationsTest { */ @Test public final void testJaccardSimilarity() { - BloomFilter filter1 = new SimpleBloomFilter(shape, from1); - BloomFilter filter2 = new SimpleBloomFilter(shape, from1); + BloomFilter filter1 = createFilter(shape, from1); + BloomFilter filter2 = createFilter(shape, from1); double intersection = /* [1..17] & [1..17] = [1..17] = */ 17.0; int union = /* [1..17] | [1..17] = [1..17] = */ 17; double expected = intersection / union; assertSymmetricOperation(expected, SetOperations::jaccardSimilarity, filter1, filter2); - filter2 = new SimpleBloomFilter(shape, from11); + filter2 = createFilter(shape, from11); intersection = /* [1..17] & [11..27] = [11..17] = */ 7.0; union = /* [1..17] | [11..27] = [1..27] = */ 27; expected = intersection / union; @@ -193,7 +204,6 @@ public class SetOperationsTest { // test no values filter1 = new SimpleBloomFilter(shape); filter2 = new SimpleBloomFilter(shape); - BloomFilter filter3 = new SimpleBloomFilter(shape, from1); assertSymmetricOperation(0.0, SetOperations::jaccardSimilarity, filter1, filter2); intersection = /* [] & [1..17] = [] = */ 0.0; @@ -205,16 +215,16 @@ public class SetOperationsTest { @Test public final void testOrCardinality() { Shape shape = Shape.fromKM(3, 128); - SparseBloomFilter filter1 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 1, 63, 64 })); - SparseBloomFilter filter2 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 64, 69 })); + BloomFilter filter1 = createFilter(shape, IndexProducer.fromIndexArray(new int[] { 1, 63, 64 })); + BloomFilter filter2 = createFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 64, 69 })); assertSymmetricOperation(5, SetOperations::orCardinality, filter1, filter2); - filter1 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 1, 63 })); - filter2 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 64, 69 })); + filter1 = createFilter(shape, IndexProducer.fromIndexArray(new int[] { 1, 63 })); + filter2 = createFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 64, 69 })); assertSymmetricOperation(5, SetOperations::orCardinality, filter1, filter2); - filter1 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 63 })); - filter2 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 64, 69 })); + filter1 = createFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 63 })); + filter2 = createFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 64, 69 })); assertSymmetricOperation(4, SetOperations::orCardinality, filter1, filter2); } @@ -222,32 +232,32 @@ public class SetOperationsTest { public final void testOrCardinalityWithDifferentLengthFilters() { Shape shape = Shape.fromKM(3, 128); Shape shape2 = Shape.fromKM(3, 192); - SparseBloomFilter filter1 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 1, 63, 64 })); - SparseBloomFilter filter2 = new SparseBloomFilter(shape2, IndexProducer.fromIndexArray(new int[] { 5, 64, 169 })); + BloomFilter filter1 = createFilter(shape, IndexProducer.fromIndexArray(new int[] { 1, 63, 64 })); + BloomFilter filter2 = createFilter(shape2, IndexProducer.fromIndexArray(new int[] { 5, 64, 169 })); assertSymmetricOperation(5, SetOperations::orCardinality, filter1, filter2); - filter1 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 1, 63 })); - filter2 = new SparseBloomFilter(shape2, IndexProducer.fromIndexArray(new int[] { 5, 64, 169 })); + filter1 = createFilter(shape, IndexProducer.fromIndexArray(new int[] { 1, 63 })); + filter2 = createFilter(shape2, IndexProducer.fromIndexArray(new int[] { 5, 64, 169 })); assertSymmetricOperation(5, SetOperations::orCardinality, filter1, filter2); - filter1 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 63 })); - filter2 = new SparseBloomFilter(shape2, IndexProducer.fromIndexArray(new int[] { 5, 64, 169 })); + filter1 = createFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 63 })); + filter2 = createFilter(shape2, IndexProducer.fromIndexArray(new int[] { 5, 64, 169 })); assertSymmetricOperation(4, SetOperations::orCardinality, filter1, filter2); } @Test public final void testAndCardinality() { Shape shape = Shape.fromKM(3, 128); - SparseBloomFilter filter1 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 1, 63, 64 })); - SparseBloomFilter filter2 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 64, 69 })); + BloomFilter filter1 = createFilter(shape, IndexProducer.fromIndexArray(new int[] { 1, 63, 64 })); + BloomFilter filter2 = createFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 64, 69 })); assertSymmetricOperation(1, SetOperations::andCardinality, filter1, filter2); - filter1 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 1, 63 })); - filter2 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 64, 69 })); + filter1 = createFilter(shape, IndexProducer.fromIndexArray(new int[] { 1, 63 })); + filter2 = createFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 64, 69 })); assertSymmetricOperation(0, SetOperations::andCardinality, filter1, filter2); - filter1 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 63 })); - filter2 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 64, 69 })); + filter1 = createFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 63 })); + filter2 = createFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 64, 69 })); assertSymmetricOperation(1, SetOperations::andCardinality, filter1, filter2); } @@ -255,37 +265,37 @@ public class SetOperationsTest { public final void testAndCardinalityWithDifferentLengthFilters() { Shape shape = Shape.fromKM(3, 128); Shape shape2 = Shape.fromKM(3, 192); - SparseBloomFilter filter1 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 1, 63, 64 })); - SparseBloomFilter filter2 = new SparseBloomFilter(shape2, IndexProducer.fromIndexArray(new int[] { 5, 64, 169 })); + BloomFilter filter1 = createFilter(shape, IndexProducer.fromIndexArray(new int[] { 1, 63, 64 })); + BloomFilter filter2 = createFilter(shape2, IndexProducer.fromIndexArray(new int[] { 5, 64, 169 })); assertSymmetricOperation(1, SetOperations::andCardinality, filter1, filter2); - filter1 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 1, 63 })); - filter2 = new SparseBloomFilter(shape2, IndexProducer.fromIndexArray(new int[] { 5, 64, 169 })); + filter1 = createFilter(shape, IndexProducer.fromIndexArray(new int[] { 1, 63 })); + filter2 = createFilter(shape2, IndexProducer.fromIndexArray(new int[] { 5, 64, 169 })); assertSymmetricOperation(0, SetOperations::andCardinality, filter1, filter2); - filter1 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 63 })); - filter2 = new SparseBloomFilter(shape2, IndexProducer.fromIndexArray(new int[] { 5, 64, 169 })); + filter1 = createFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 63 })); + filter2 = createFilter(shape2, IndexProducer.fromIndexArray(new int[] { 5, 64, 169 })); assertSymmetricOperation(1, SetOperations::andCardinality, filter1, filter2); } @Test public final void testXorCardinality() { Shape shape = Shape.fromKM(3, 128); - SparseBloomFilter filter1 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 1, 63, 64 })); - SparseBloomFilter filter2 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 64, 69 })); + BloomFilter filter1 = createFilter(shape, IndexProducer.fromIndexArray(new int[] { 1, 63, 64 })); + BloomFilter filter2 = createFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 64, 69 })); assertSymmetricOperation(4, SetOperations::xorCardinality, filter1, filter2); - filter1 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 1, 63 })); - filter2 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 64, 69 })); + filter1 = createFilter(shape, IndexProducer.fromIndexArray(new int[] { 1, 63 })); + filter2 = createFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 64, 69 })); assertSymmetricOperation(5, SetOperations::xorCardinality, filter1, filter2); - filter1 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 63 })); - filter2 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 64, 69 })); + filter1 = createFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 63 })); + filter2 = createFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 64, 69 })); assertSymmetricOperation(3, SetOperations::xorCardinality, filter1, filter2); Shape bigShape = Shape.fromKM(3, 192); - filter1 = new SparseBloomFilter(bigShape, IndexProducer.fromIndexArray(new int[] { 1, 63, 185})); - filter2 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 63, 69 })); + filter1 = createFilter(bigShape, IndexProducer.fromIndexArray(new int[] { 1, 63, 185})); + filter2 = createFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 63, 69 })); assertSymmetricOperation(4, SetOperations::xorCardinality, filter1, filter2); } @@ -294,16 +304,16 @@ public class SetOperationsTest { Shape shape = Shape.fromKM(3, 128); Shape shape2 = Shape.fromKM(3, 192); - SparseBloomFilter filter1 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 1, 63, 64 })); - SparseBloomFilter filter2 = new SparseBloomFilter(shape2, IndexProducer.fromIndexArray(new int[] { 5, 64, 169 })); + BloomFilter filter1 = createFilter(shape, IndexProducer.fromIndexArray(new int[] { 1, 63, 64 })); + BloomFilter filter2 = createFilter(shape2, IndexProducer.fromIndexArray(new int[] { 5, 64, 169 })); assertSymmetricOperation(4, SetOperations::xorCardinality, filter1, filter2); - filter1 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 1, 63 })); - filter2 = new SparseBloomFilter(shape2, IndexProducer.fromIndexArray(new int[] { 5, 64, 169 })); + filter1 = createFilter(shape, IndexProducer.fromIndexArray(new int[] { 1, 63 })); + filter2 = createFilter(shape2, IndexProducer.fromIndexArray(new int[] { 5, 64, 169 })); assertSymmetricOperation(5, SetOperations::xorCardinality, filter1, filter2); - filter1 = new SparseBloomFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 63 })); - filter2 = new SparseBloomFilter(shape2, IndexProducer.fromIndexArray(new int[] { 5, 64, 169 })); + filter1 = createFilter(shape, IndexProducer.fromIndexArray(new int[] { 5, 63 })); + filter2 = createFilter(shape2, IndexProducer.fromIndexArray(new int[] { 5, 64, 169 })); assertSymmetricOperation(3, SetOperations::xorCardinality, filter1, filter2); } diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java index 15aeea62c..b37c0ffe8 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilterTest.java @@ -16,7 +16,6 @@ */ package org.apache.commons.collections4.bloomfilter; -import org.junit.jupiter.api.Test; /** * Tests for the {@link SimpleBloomFilter}. @@ -26,63 +25,4 @@ public class SimpleBloomFilterTest extends AbstractBloomFilterTest x == y)); }