Revert CountingBloomFilter to ignore counts from another filter.

Partially removes changes made in commit:
6ad69bedd3436a75883e66bd130c77df884be98b

The class requires a revision to handle add/subtract of another
CountingBloomFilter. Restore the tests to check that Counting filters
are merged as if they were another non-counting filter type.

Remove the javadoc from the CountingBloomFilter methods stating that
the counts are used when merge/remove is called with a
CountingBloomFilter, as this is not what the methods currently do.

Fix merge with a hasher to increment each count by only 1 even if the
hasher contains duplicates. Add a test to verify this works as
documented. This matches the remove functionality, which removes
duplicates before subtraction.
Alex Herbert 2020-02-24 22:42:20 +00:00
parent ad04fff90d
commit e08f0be55b
2 changed files with 186 additions and 107 deletions
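
For context on the duplicate handling described in the message above: it amounts to collapsing the hasher's indices into a set before touching the counts, so each index is adjusted at most once per call. Below is a minimal standalone sketch of the idea in plain Java, with a map standing in for the filter's counts; it is illustrative only and is not the project code (the real helper, toStream(hasher), appears in the diff further down).

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

public class DuplicateIndexDemo {
    public static void main(String[] args) {
        final Map<Integer, Integer> counts = new HashMap<>();
        // A hasher may emit the same index more than once, e.g. 1, 1, 3, 3, 4.
        final int[] indices = {1, 1, 3, 3, 4};
        // Collapse duplicates first, mirroring the toStream(hasher) helper in the diff below.
        final Set<Integer> distinct = new HashSet<>();
        for (final int idx : indices) {
            distinct.add(idx);
        }
        // Each distinct index is incremented by exactly 1.
        for (final int idx : distinct) {
            counts.merge(idx, 1, Integer::sum);
        }
        System.out.println(counts); // {1=1, 3=1, 4=1}
    }
}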

File: CountingBloomFilter.java

@@ -19,7 +19,6 @@ package org.apache.commons.collections4.bloomfilter;
 import java.util.AbstractMap;
 import java.util.BitSet;
 import java.util.HashSet;
-import java.util.Iterator;
 import java.util.Map;
 import java.util.PrimitiveIterator.OfInt;
 import java.util.function.Consumer;
@@ -151,115 +150,117 @@ public class CountingBloomFilter extends AbstractBloomFilter {
     }
     /**
-     * Merge this Bloom filter with the other creating a new filter. The counts for
-     * bits that are on in the other filter are incremented.
+     * {@inheritDoc}
      * <p>
-     * For each bit that is turned on in the other filter; if the other filter is
-     * also a CountingBloomFilter the count is added to this filter, otherwise the
-     * count is incremented by one.
+     * Note: If the other filter is also a CountingBloomFilter the counts are not used.
      * </p>
      *
-     * @param other the other filter.
+     * @throws IllegalStateException If the counts overflow {@link Integer#MAX_VALUE}
      */
     @Override
     public void merge(final BloomFilter other) {
         verifyShape(other);
         if (other instanceof CountingBloomFilter) {
-            // TODO - This should use the entrySet and increment each key by the count
-            merge(((CountingBloomFilter) other).counts.keySet().iterator());
+            // Only use the keys and not the counts
+            ((CountingBloomFilter) other).counts.keySet().forEach(this::addIndex);
         } else {
-            merge(BitSet.valueOf(other.getBits()).stream().iterator());
+            BitSet.valueOf(other.getBits()).stream().forEach(this::addIndex);
         }
     }
+    /**
+     * {@inheritDoc}
+     *
+     * @throws IllegalStateException If the counts overflow {@link Integer#MAX_VALUE}
+     */
     @Override
     public void merge(final Hasher hasher) {
         verifyHasher(hasher);
-        merge(hasher.getBits(getShape()));
+        toStream(hasher).forEach(this::addIndex);
     }
     /**
-     * Merges an iterator of set bits into this filter.
-     * @param iter the iterator of bits to set.
-     */
-    private void merge(final Iterator<Integer> iter) {
-        iter.forEachRemaining(idx -> {
-            final Integer val = counts.get(idx);
-            if (val == null) {
-                counts.put(idx, 1 );
-            } else if (val == Integer.MAX_VALUE) {
-                throw new IllegalStateException("Overflow on index " + idx);
-            } else {
-                counts.put(idx, val + 1);
-            }
-        });
-    }
-    /**
-     * Decrements the counts for the bits that are on in the other BloomFilter from this
-     * one.
+     * Increments the count for the bit index.
      *
+     * @param idx the index.
+     * @throws IllegalStateException If the counts overflow {@link Integer#MAX_VALUE}
+     */
+    private void addIndex(int idx) {
+        final Integer val = counts.get(idx);
+        if (val == null) {
+            counts.put(idx, 1);
+        } else if (val == Integer.MAX_VALUE) {
+            throw new IllegalStateException("Overflow on index " + idx);
+        } else {
+            counts.put(idx, val + 1);
+        }
+    }
+    /**
+     * Removes the other Bloom filter from this one.
      * <p>
      * For each bit that is turned on in the other filter the count is decremented by 1.
      * </p>
      *
      * @param other the other filter.
+     * @throws IllegalStateException If the count for a decremented bit was already at zero
      */
     public void remove(final BloomFilter other) {
         verifyShape(other);
         if (other instanceof CountingBloomFilter) {
-            // TODO - This should use the entrySet and decrement each key by the count
-            remove(((CountingBloomFilter) other).counts.keySet().stream());
+            // Only use the keys and not the counts
+            ((CountingBloomFilter) other).counts.keySet().forEach(this::subtractIndex);
         } else {
-            remove(BitSet.valueOf(other.getBits()).stream().boxed());
+            BitSet.valueOf(other.getBits()).stream().forEach(this::subtractIndex);
         }
     }
     /**
      * Decrements the counts for the bits that are on in the hasher from this
      * Bloom filter.
+     *
      * <p>
-     * For each bit that is turned on in the other filter the count is decremented by 1.
+     * For each bit that is identified by the hasher the count is decremented by 1.
+     * Duplicate bits are ignored.
      * </p>
      *
      * @param hasher the hasher to generate bits.
+     * @throws IllegalStateException If the count for a decremented bit was already at zero
      */
     public void remove(final Hasher hasher) {
         verifyHasher(hasher);
-        final Set<Integer> lst = new HashSet<>();
-        hasher.getBits(getShape()).forEachRemaining((Consumer<Integer>) lst::add);
-        remove(lst.stream());
+        toStream(hasher).forEach(this::subtractIndex);
     }
     /**
-     * Decrements the counts for the bits specified in the Integer stream.
+     * Decrements the count for the bit index.
      *
-     * @param idxStream The stream of bit counts to decrement.
+     * @param idx the index.
+     * @throws IllegalStateException If the count for a decremented bit was already at zero
      */
-    private void remove(final Stream<Integer> idxStream) {
-        idxStream.forEach(idx -> {
-            // TODO - This doubles up removal of the index.
-            // The first part will not throw if the count decrements past zero.
-            // The second part will throw if the count decrements past zero.
-            final Integer val = counts.get(idx);
-            if (val != null) {
-                if (val - 1 == 0) {
-                    counts.remove(idx);
-                } else {
-                    counts.put(idx, val - 1);
-                }
-            }
-            if (val == null || val == 0) {
-                throw new IllegalStateException("Underflow on index " + idx);
-            } else if (val - 1 == 0) {
+    private void subtractIndex(int idx) {
+        final Integer val = counts.get(idx);
+        if (val != null) {
+            if (val == 1) {
                 counts.remove(idx);
             } else {
                 counts.put(idx, val - 1);
             }
-        });
+        } else {
+            throw new IllegalStateException("Underflow on index " + idx);
+        }
     }
+    /**
+     * Convert the hasher to a stream. Duplicates indices are removed.
+     *
+     * @param hasher the hasher
+     * @return the stream
+     */
+    private Stream<Integer> toStream(Hasher hasher) {
+        final Set<Integer> lst = new HashSet<>();
+        hasher.getBits(getShape()).forEachRemaining((Consumer<Integer>) lst::add);
+        return lst.stream();
+    }
     @Override
     public String toString() {
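
The practical effect of the merge change above: when the argument is another CountingBloomFilter, only its key set contributes, so every index present in the other filter bumps this filter's count by exactly 1, whatever the other filter's count was. A standalone sketch of that semantic using plain maps follows; it is illustrative only and does not use the project API.

import java.util.HashMap;
import java.util.Map;

public class KeyOnlyMergeDemo {
    public static void main(String[] args) {
        // Counts held by "this" filter.
        final Map<Integer, Integer> thisCounts = new HashMap<>();
        thisCounts.put(1, 5);
        thisCounts.put(2, 1);

        // Counts held by the other counting filter.
        final Map<Integer, Integer> otherCounts = new HashMap<>();
        otherCounts.put(1, 100); // a high count in the other filter...
        otherCounts.put(3, 7);

        // ...but only the keys are used: each key increments the count by 1.
        for (final Integer idx : otherCounts.keySet()) {
            thisCounts.merge(idx, 1, Integer::sum);
        }
        System.out.println(thisCounts); // {1=6, 2=1, 3=1}
    }
}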

File: CountingBloomFilterTest.java

@@ -24,12 +24,13 @@ import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.PrimitiveIterator.OfInt;
+import org.apache.commons.collections4.bloomfilter.hasher.HashFunctionIdentity;
 import org.apache.commons.collections4.bloomfilter.hasher.Hasher;
 import org.apache.commons.collections4.bloomfilter.hasher.Shape;
 import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher;
 import org.junit.Assert;
-import org.junit.Ignore;
 import org.junit.Test;
 /**
@@ -140,33 +141,35 @@ public class CountingBloomFilterTest extends AbstractBloomFilterTest {
      * Tests that merge correctly updates the counts when a CountingBloomFilter is passed.
      */
     @Test
-    @Ignore
     public void mergeTest_Counts() {
-        final int[] c1 = {1, 10, 100, 0};
-        final int[] c2 = {4, 0, 7, 2};
-        final Map<Integer, Integer> map1 = new HashMap<>();
-        for (int i = 0; i < c1.length; i++) {
-            map1.put(i, c1[i]);
-        }
-        final Map<Integer, Integer> map2 = new HashMap<>();
-        for (int i = 0; i < c2.length; i++) {
-            map2.put(i, c2[i]);
-        }
-        final CountingBloomFilter bf1 = new CountingBloomFilter(map1, shape);
-        final BloomFilter bf2 = new CountingBloomFilter(map2, shape);
-        bf1.merge(bf2);
-        assertEquals(4, bf1.cardinality());
+        final int[] expected = {
+            0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+            1, 2, 2, 2, 2, 2, 2, 2, 1, 1,
+            1, 1, 1, 1, 1, 1, 1, 1, 0
+        };
+        final List<Integer> lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);
+        final Hasher hasher = new StaticHasher(lst.iterator(), shape);
+        final CountingBloomFilter bf = createFilter(hasher, shape);
+        final List<Integer> lst2 = Arrays.asList(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27);
+        final Hasher hasher2 = new StaticHasher(lst2.iterator(), shape);
+        final BloomFilter bf2 = createFilter(hasher2, shape);
+        bf.merge(bf2);
+        assertEquals(27, bf.getCounts().count());
+        assertEquals(Integer.valueOf(2), bf.getCounts().map(Map.Entry::getValue).max(Integer::compare).get());
+        assertEquals(Integer.valueOf(1), bf.getCounts().map(Map.Entry::getValue).min(Integer::compare).get());
         final Map<Integer, Integer> m = new HashMap<>();
-        bf1.getCounts().forEach(e -> m.put(e.getKey(), e.getValue()));
-        // This currently fails as the counts from the second filter are ignored.
-        for (int i = 0; i < c1.length; i++) {
-            assertEquals(c1[i] + c2[i], m.get(i).intValue());
+        bf.getCounts().forEach(e -> m.put(e.getKey(), e.getValue()));
+        for (int i = 0; i < 29; i++) {
+            if (m.get(i) == null) {
+                assertEquals("Wrong value for " + i, expected[i], 0);
+            } else {
+                assertEquals("Wrong value for " + i, expected[i], m.get(i).intValue());
+            }
         }
     }
@@ -238,7 +241,7 @@ public class CountingBloomFilterTest extends AbstractBloomFilterTest {
     }
     /**
-     * Test that merge correctly updates the counts when a Hasher is passed
+     * Test that merge correctly updates the counts when a Hasher is passed.
      */
     @Test
     public void mergeTest_Shape_Hasher_Count() {
@@ -273,42 +276,82 @@ public class CountingBloomFilterTest extends AbstractBloomFilterTest {
         }
     }
+    /**
+     * Test that merge correctly updates the counts when a Hasher is passed with duplicates.
+     */
+    @Test
+    public void mergeTest_Shape_Hasher_Duplicates() {
+        final int[] expected = {
+            0, 2, 1, 2, 1
+        };
+        final List<Integer> lst = Arrays.asList(1, 2, 3);
+        final Hasher hasher = new StaticHasher(lst.iterator(), shape);
+        final CountingBloomFilter bf = createFilter(hasher, shape);
+        final Hasher hasher2 = new Hasher() {
+            @Override
+            public OfInt getBits(Shape shape) {
+                // Deliberately return duplicates
+                return Arrays.asList(1, 1, 1, 1, 3, 3, 4).stream().mapToInt(Integer::intValue).iterator();
+            }
+            @Override
+            public HashFunctionIdentity getHashFunctionIdentity() {
+                return shape.getHashFunctionIdentity();
+            }
+            @Override
+            public boolean isEmpty() {
+                return false;
+            }
+        };
+        bf.merge(hasher2);
+        assertEquals(4, bf.getCounts().count());
+        final Map<Integer, Integer> m = new HashMap<>();
+        bf.getCounts().forEach(e -> m.put(e.getKey(), e.getValue()));
+        for (int i = 0; i < expected.length; i++) {
+            if (m.get(i) == null) {
+                assertEquals("Wrong value for " + i, expected[i], 0);
+            } else {
+                assertEquals("Wrong value for " + i, expected[i], m.get(i).intValue());
+            }
+        }
+    }
     /**
      * Tests that when removing a counting Bloom filter the counts are correctly updated.
      */
     @Test
-    @Ignore
     public void removeTest_Counting() {
-        final int[] c1 = {1, 10, 100, 0};
-        final int[] c2 = {1, 0, 7, 2};
-        // If left alone the test fails with an exception as there is underflow
-        // for (0 - 2). Fix this but the underflow behaviour is subject to change.
-        c2[c2.length - 1] = 0;
-        final Map<Integer, Integer> map1 = new HashMap<>();
-        for (int i = 0; i < c1.length; i++) {
-            map1.put(i, c1[i]);
-        }
+        final int[] values = {
+            0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+            1, 2, 2, 2, 2, 2, 2, 2, 1, 1,
+            1, 1, 1, 1, 1, 1, 1, 1
+        };
+        final Map<Integer, Integer> map = new HashMap<>();
+        for (int i = 1; i < values.length; i++) {
+            map.put(i, values[i]);
+        }
+        final CountingBloomFilter bf = new CountingBloomFilter(map, shape);
+        final List<Integer> lst = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);
+        final Hasher hasher = new StaticHasher(lst.iterator(), shape);
+        final BloomFilter bf2 = new CountingBloomFilter(hasher, shape);
+        bf.remove(bf2);
+        assertEquals(17, bf.cardinality());
         final Map<Integer, Integer> map2 = new HashMap<>();
-        for (int i = 0; i < c2.length; i++) {
-            map2.put(i, c2[i]);
-        }
-        final CountingBloomFilter bf1 = new CountingBloomFilter(map1, shape);
-        final BloomFilter bf2 = new CountingBloomFilter(map2, shape);
-        bf1.remove(bf2);
-        assertEquals(2, bf1.cardinality());
-        final Map<Integer, Integer> m = new HashMap<>();
-        bf1.getCounts().forEach(e -> m.put(e.getKey(), e.getValue()));
-        // This currently fails as the counts from the second filter are ignored
-        // and only 1 is subtracted for each index.
-        for (int i = 0; i < c1.length; i++) {
-            assertEquals(Math.max(0, c1[i] - c2[i]), m.getOrDefault(i, 0).intValue());
-        }
+        bf.getCounts().forEach(e -> map2.put(e.getKey(), e.getValue()));
+        for (int i = 11; i < values.length; i++) {
+            assertNotNull(map2.get(i));
+            assertEquals(1, map2.get(i).intValue());
+        }
     }
@@ -343,6 +386,41 @@ public class CountingBloomFilterTest extends AbstractBloomFilterTest {
         }
     }
+    /**
+     * Tests that removing a hasher update the counts properly if there are duplicates.
+     */
+    @Test
+    public void removeTest_Hasher_Duplicates() {
+        final int[] values = {
+            0, 3, 2, 1
+        };
+        final int[] expected = {
+            0, 2, 1, 0
+        };
+        final Map<Integer, Integer> map = new HashMap<>();
+        for (int i = 1; i < values.length; i++) {
+            map.put(i, values[i]);
+        }
+        final CountingBloomFilter bf = new CountingBloomFilter(map, shape);
+        final List<Integer> lst = Arrays.asList(1, 1, 1, 1, 1, 1, 2, 3, 3, 3);
+        final Hasher hasher = new StaticHasher(lst.iterator(), shape);
+        bf.remove(hasher);
+        assertEquals(2, bf.cardinality());
+        final Map<Integer, Integer> map2 = new HashMap<>();
+        bf.getCounts().forEach(e -> map2.put(e.getKey(), e.getValue()));
+        for (int i = 0; i < expected.length; i++) {
+            if (map2.get(i) == null) {
+                assertEquals("Wrong value for " + i, expected[i], 0);
+            } else {
+                assertEquals("Wrong value for " + i, expected[i], map2.get(i).intValue());
+            }
+        }
+    }
     /**
      * Tests that when removing a standard Bloom filter the counts are correctly updated.
      */