From e08f0be55b9e17d9e03e6f311b76044cb786cd7a Mon Sep 17 00:00:00 2001 From: Alex Herbert Date: Mon, 24 Feb 2020 22:42:20 +0000 Subject: [PATCH] Revert CountingBloomFilter to ignore counts from another filter. Partially removes changes made in commit: 6ad69bedd3436a75883e66bd130c77df884be98b The class requires a revision to handle add/subtract of another CountingBloomFilter. Restore the tests to check that Counting filters are merged as if another non-counting filter type. Remove the javadoc from the CountingBloomFilter methods that state it uses the counts when merge/remove are called with a CountingBloomFilter as this is not what the functionality currently performs. Fix the merge with a hasher to only increment the count by 1 even if the hasher contains duplicates. Add test to verify this works as documented. This matches the remove functionality which removes duplicates before subtraction. --- .../bloomfilter/CountingBloomFilter.java | 115 +++++------ .../bloomfilter/CountingBloomFilterTest.java | 178 +++++++++++++----- 2 files changed, 186 insertions(+), 107 deletions(-) diff --git a/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java index d3df6c267..a55b0b5a2 100644 --- a/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java +++ b/src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java @@ -19,7 +19,6 @@ package org.apache.commons.collections4.bloomfilter; import java.util.AbstractMap; import java.util.BitSet; import java.util.HashSet; -import java.util.Iterator; import java.util.Map; import java.util.PrimitiveIterator.OfInt; import java.util.function.Consumer; @@ -151,115 +150,117 @@ public class CountingBloomFilter extends AbstractBloomFilter { } /** - * Merge this Bloom filter with the other creating a new filter. The counts for - * bits that are on in the other filter are incremented. + * {@inheritDoc} *

- * For each bit that is turned on in the other filter; if the other filter is - * also a CountingBloomFilter the count is added to this filter, otherwise the - * count is incremented by one. + * Note: If the other filter is also a CountingBloomFilter the counts are not used. *

* - * @param other the other filter. + * @throws IllegalStateException If the counts overflow {@link Integer#MAX_VALUE} */ @Override public void merge(final BloomFilter other) { verifyShape(other); if (other instanceof CountingBloomFilter) { - // TODO - This should use the entrySet and increment each key by the count - merge(((CountingBloomFilter) other).counts.keySet().iterator()); + // Only use the keys and not the counts + ((CountingBloomFilter) other).counts.keySet().forEach(this::addIndex); } else { - merge(BitSet.valueOf(other.getBits()).stream().iterator()); + BitSet.valueOf(other.getBits()).stream().forEach(this::addIndex); } } + /** + * {@inheritDoc} + * + * @throws IllegalStateException If the counts overflow {@link Integer#MAX_VALUE} + */ @Override public void merge(final Hasher hasher) { verifyHasher(hasher); - merge(hasher.getBits(getShape())); + toStream(hasher).forEach(this::addIndex); } /** - * Merges an iterator of set bits into this filter. - * @param iter the iterator of bits to set. - */ - private void merge(final Iterator iter) { - iter.forEachRemaining(idx -> { - final Integer val = counts.get(idx); - if (val == null) { - counts.put(idx, 1 ); - } else if (val == Integer.MAX_VALUE) { - throw new IllegalStateException("Overflow on index " + idx); - } else { - counts.put(idx, val + 1); - } - }); - } - - /** - * Decrements the counts for the bits that are on in the other BloomFilter from this - * one. + * Increments the count for the bit index. * + * @param idx the index. + * @throws IllegalStateException If the counts overflow {@link Integer#MAX_VALUE} + */ + private void addIndex(int idx) { + final Integer val = counts.get(idx); + if (val == null) { + counts.put(idx, 1); + } else if (val == Integer.MAX_VALUE) { + throw new IllegalStateException("Overflow on index " + idx); + } else { + counts.put(idx, val + 1); + } + } + + /** + * Removes the other Bloom filter from this one. *

* For each bit that is turned on in the other filter the count is decremented by 1. *

* * @param other the other filter. + * @throws IllegalStateException If the count for a decremented bit was already at zero */ public void remove(final BloomFilter other) { verifyShape(other); if (other instanceof CountingBloomFilter) { - // TODO - This should use the entrySet and decrement each key by the count - remove(((CountingBloomFilter) other).counts.keySet().stream()); + // Only use the keys and not the counts + ((CountingBloomFilter) other).counts.keySet().forEach(this::subtractIndex); } else { - remove(BitSet.valueOf(other.getBits()).stream().boxed()); + BitSet.valueOf(other.getBits()).stream().forEach(this::subtractIndex); } } /** * Decrements the counts for the bits that are on in the hasher from this * Bloom filter. - * *

- * For each bit that is turned on in the other filter the count is decremented by 1. + * For each bit that is identified by the hasher the count is decremented by 1. + * Duplicate bits are ignored. *

* * @param hasher the hasher to generate bits. + * @throws IllegalStateException If the count for a decremented bit was already at zero */ public void remove(final Hasher hasher) { verifyHasher(hasher); - final Set lst = new HashSet<>(); - hasher.getBits(getShape()).forEachRemaining((Consumer) lst::add); - remove(lst.stream()); + toStream(hasher).forEach(this::subtractIndex); } /** - * Decrements the counts for the bits specified in the Integer stream. + * Decrements the count for the bit index. * - * @param idxStream The stream of bit counts to decrement. + * @param idx the index. + * @throws IllegalStateException If the count for a decremented bit was already at zero */ - private void remove(final Stream idxStream) { - idxStream.forEach(idx -> { - // TODO - This doubles up removal of the index. - // The first part will not throw if the count decrements past zero. - // The second part will throw if the count decrements past zero. - final Integer val = counts.get(idx); - if (val != null) { - if (val - 1 == 0) { - counts.remove(idx); - } else { - counts.put(idx, val - 1); - } - } - if (val == null || val == 0) { - throw new IllegalStateException("Underflow on index " + idx); - } else if (val - 1 == 0) { + private void subtractIndex(int idx) { + final Integer val = counts.get(idx); + if (val != null) { + if (val == 1) { counts.remove(idx); } else { counts.put(idx, val - 1); } - }); + } else { + throw new IllegalStateException("Underflow on index " + idx); + } } + /** + * Convert the hasher to a stream. Duplicates indices are removed. + * + * @param hasher the hasher + * @return the stream + */ + private Stream toStream(Hasher hasher) { + final Set lst = new HashSet<>(); + hasher.getBits(getShape()).forEachRemaining((Consumer) lst::add); + return lst.stream(); + } @Override public String toString() { diff --git a/src/test/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilterTest.java b/src/test/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilterTest.java index a42f3f620..28220a75f 100644 --- a/src/test/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilterTest.java +++ b/src/test/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilterTest.java @@ -24,12 +24,13 @@ import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.PrimitiveIterator.OfInt; +import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity; import org.apache.commons.collections4.bloomfilter.hasher.Hasher; import org.apache.commons.collections4.bloomfilter.hasher.Shape; import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; import org.junit.Assert; -import org.junit.Ignore; import org.junit.Test; /** @@ -140,33 +141,35 @@ public class CountingBloomFilterTest extends AbstractBloomFilterTest { * Tests that merge correctly updates the counts when a CountingBloomFilter is passed. */ @Test - @Ignore public void mergeTest_Counts() { - final int[] c1 = {1, 10, 100, 0}; - final int[] c2 = {4, 0, 7, 2}; + final int[] expected = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 0 + }; + final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); + final Hasher hasher = new StaticHasher(lst.iterator(), shape); - final Map map1 = new HashMap<>(); - for (int i = 0; i < c1.length; i++) { - map1.put(i, c1[i]); - } - final Map map2 = new HashMap<>(); - for (int i = 0; i < c2.length; i++) { - map2.put(i, c2[i]); - } + final CountingBloomFilter bf = createFilter(hasher, shape); - final CountingBloomFilter bf1 = new CountingBloomFilter(map1, shape); - final BloomFilter bf2 = new CountingBloomFilter(map2, shape); + final List lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27); + final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape); + final BloomFilter bf2 = createFilter(hasher2, shape); - bf1.merge(bf2); + bf.merge(bf2); - assertEquals(4, bf1.cardinality()); + assertEquals(27, bf.getCounts().count()); + assertEquals(Integer.valueOf(2), bf.getCounts().map(Map.Entry::getValue).max(Integer::compare).get()); + assertEquals(Integer.valueOf(1), bf.getCounts().map(Map.Entry::getValue).min(Integer::compare).get()); final Map m = new HashMap<>(); - bf1.getCounts().forEach(e -> m.put(e.getKey(), e.getValue())); - - // This currently fails as the counts from the second filter are ignored. - for (int i = 0; i < c1.length; i++) { - assertEquals(c1[i] + c2[i], m.get(i).intValue()); + bf.getCounts().forEach(e -> m.put(e.getKey(), e.getValue())); + for (int i = 0; i < 29; i++) { + if (m.get(i) == null) { + assertEquals("Wrong value for " + i, expected[i], 0); + } else { + assertEquals("Wrong value for " + i, expected[i], m.get(i).intValue()); + } } } @@ -238,7 +241,7 @@ public class CountingBloomFilterTest extends AbstractBloomFilterTest { } /** - * Test that merge correctly updates the counts when a Hasher is passed + * Test that merge correctly updates the counts when a Hasher is passed. */ @Test public void mergeTest_Shape_Hasher_Count() { @@ -273,42 +276,82 @@ public class CountingBloomFilterTest extends AbstractBloomFilterTest { } } + /** + * Test that merge correctly updates the counts when a Hasher is passed with duplicates. + */ + @Test + public void mergeTest_Shape_Hasher_Duplicates() { + final int[] expected = { + 0, 2, 1, 2, 1 + }; + + final List lst = Arrays.asList(1, 2, 3); + final Hasher hasher = new StaticHasher(lst.iterator(), shape); + + final CountingBloomFilter bf = createFilter(hasher, shape); + + final Hasher hasher2 = new Hasher() { + @Override + public OfInt getBits(Shape shape) { + // Deliberately return duplicates + return Arrays.asList(1, 1, 1, 1, 3, 3, 4).stream().mapToInt(Integer::intValue).iterator(); + } + + @Override + public HashFunctionIdentity getHashFunctionIdentity() { + return shape.getHashFunctionIdentity(); + } + + @Override + public boolean isEmpty() { + return false; + } + }; + + bf.merge(hasher2); + + assertEquals(4, bf.getCounts().count()); + + final Map m = new HashMap<>(); + bf.getCounts().forEach(e -> m.put(e.getKey(), e.getValue())); + for (int i = 0; i < expected.length; i++) { + if (m.get(i) == null) { + assertEquals("Wrong value for " + i, expected[i], 0); + } else { + assertEquals("Wrong value for " + i, expected[i], m.get(i).intValue()); + } + } + } + /** * Tests that when removing a counting Bloom filter the counts are correctly updated. */ @Test - @Ignore public void removeTest_Counting() { - final int[] c1 = {1, 10, 100, 0}; - final int[] c2 = {1, 0, 7, 2}; - - // If left alone the test fails with an exception as there is underflow - // for (0 - 2). Fix this but the underflow behaviour is subject to change. - c2[c2.length - 1] = 0; - - final Map map1 = new HashMap<>(); - for (int i = 0; i < c1.length; i++) { - map1.put(i, c1[i]); + final int[] values = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1 + }; + final Map map = new HashMap<>(); + for (int i = 1; i < values.length; i++) { + map.put(i, values[i]); } + + final CountingBloomFilter bf = new CountingBloomFilter(map, shape); + + final List lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17); + final Hasher hasher = new StaticHasher(lst.iterator(), shape); + final BloomFilter bf2 = new CountingBloomFilter(hasher, shape); + + bf.remove(bf2); + assertEquals(17, bf.cardinality()); final Map map2 = new HashMap<>(); - for (int i = 0; i < c2.length; i++) { - map2.put(i, c2[i]); - } + bf.getCounts().forEach(e -> map2.put(e.getKey(), e.getValue())); - final CountingBloomFilter bf1 = new CountingBloomFilter(map1, shape); - final BloomFilter bf2 = new CountingBloomFilter(map2, shape); - - bf1.remove(bf2); - - assertEquals(2, bf1.cardinality()); - - final Map m = new HashMap<>(); - bf1.getCounts().forEach(e -> m.put(e.getKey(), e.getValue())); - - // This currently fails as the counts from the second filter are ignored - // and only 1 is subtracted for each index. - for (int i = 0; i < c1.length; i++) { - assertEquals(Math.max(0, c1[i] - c2[i]), m.getOrDefault(i, 0).intValue()); + for (int i = 11; i < values.length; i++) { + assertNotNull(map2.get(i)); + assertEquals(1, map2.get(i).intValue()); } } @@ -343,6 +386,41 @@ public class CountingBloomFilterTest extends AbstractBloomFilterTest { } } + /** + * Tests that removing a hasher update the counts properly if there are duplicates. + */ + @Test + public void removeTest_Hasher_Duplicates() { + final int[] values = { + 0, 3, 2, 1 + }; + final int[] expected = { + 0, 2, 1, 0 + }; + final Map map = new HashMap<>(); + for (int i = 1; i < values.length; i++) { + map.put(i, values[i]); + } + + final CountingBloomFilter bf = new CountingBloomFilter(map, shape); + + final List lst = Arrays.asList(1, 1, 1, 1, 1, 1, 2, 3, 3, 3); + final Hasher hasher = new StaticHasher(lst.iterator(), shape); + + bf.remove(hasher); + assertEquals(2, bf.cardinality()); + final Map map2 = new HashMap<>(); + bf.getCounts().forEach(e -> map2.put(e.getKey(), e.getValue())); + + for (int i = 0; i < expected.length; i++) { + if (map2.get(i) == null) { + assertEquals("Wrong value for " + i, expected[i], 0); + } else { + assertEquals("Wrong value for " + i, expected[i], map2.get(i).intValue()); + } + } + } + /** * Tests that when removing a standard Bloom filter the counts are correctly updated. */