Collections-817: Update estimateN, estimateIntersection and

estimateUnion

Reworked calculations and updated javadoc.

Added documentation concerning n, intersection, and union calculations
in Shape class.

Modified estimatedIntersection to account for infinite values in
calculations.

This commit addresses a merge error that omitted all fixes  intended for
the changes in 64ed061e6a2a89f251ebd26dd52650d56025b5c1
This commit is contained in:
aherbert 2022-11-30 16:30:15 +00:00
parent 5bcab481f7
commit a1de787172
8 changed files with 255 additions and 23 deletions

View File

@ -150,6 +150,7 @@ public interface BloomFilter extends IndexProducer, BitMapProducer {
*
* @param hasher The hasher to merge.
* @return true if the merge was successful
* @throws IllegalArgumentException if hasher produces an illegal value.
*/
default boolean merge(final Hasher hasher) {
Objects.requireNonNull(hasher, "hasher");
@ -214,26 +215,48 @@ public interface BloomFilter extends IndexProducer, BitMapProducer {
* <p>By default this is the rounding of the {@code Shape.estimateN(cardinality)} calculation for the
* shape and cardinality of this filter.</p>
*
* <p>This produces an estimate roughly equivalent to the number of Hashers that have been merged into the filter.</p>
* <p>This produces an estimate roughly equivalent to the number of Hashers that have been merged into the filter
* by rounding the value from the calculation described in the {@link Shape} class javadoc.</p>
*
* @return an estimate of the number of items in the bloom filter.
* <p><em>Note:</em></p>
* <ul>
* <li>if cardinality == numberOfBits, then result is Integer.MAX_VALUE.</li>
* <li>if cardinality &gt; numberOfBits, then an IllegalArgumentException is thrown.</li>
* </ul>
*
* @return an estimate of the number of items in the bloom filter. Will return Integer.MAX_VALUE if the
* estimate is larger than Integer.MAX_VALUE.
* @throws IllegalArgumentException if the cardinality is &gt; numberOfBits as defined in Shape.
* @see Shape#estimateN(int)
* @see Shape
*/
default int estimateN() {
return (int) Math.round(getShape().estimateN(cardinality()));
double d = getShape().estimateN(cardinality());
if (Double.isInfinite(d)) {
return Integer.MAX_VALUE;
}
if (Double.isNaN(d)) {
throw new IllegalArgumentException("Cardinality too large: " + cardinality());
}
long l = Math.round(d);
return l > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) l;
}
/**
* Estimates the number of items in the union of this Bloom filter with the other bloom filter.
*
* <p>By default this is the {@code estimateN()} of the merging of this filter with the {@code other} filter.</p>
*
* <p>This produces an estimate roughly equivalent to the number of unique Hashers that have been merged into either
* of the filters.</p>
* of the filters by rounding the value from the calculation described in the {@link Shape} class javadoc.</p>
*
* <p><em>{@code estimateUnion} should only be called with Bloom filters of the same Shape. If called on Bloom
* filters of differing shape this method is not symmetric. If {@code other} has more bits an {@code IllegalArgumentException}
* may be thrown.</em></p>
*
* @param other The other Bloom filter
* @return an estimate of the number of items in the union.
* @return an estimate of the number of items in the union. Will return Integer.MAX_VALUE if the
* estimate is larger than Integer.MAX_VALUE.
* @see #estimateN()
* @see Shape
*/
default int estimateUnion(final BloomFilter other) {
Objects.requireNonNull(other, "other");
@ -245,16 +268,44 @@ public interface BloomFilter extends IndexProducer, BitMapProducer {
/**
* Estimates the number of items in the intersection of this Bloom filter with the other bloom filter.
*
* <p>By default this is the {@code estimateN() + other.estimateN() - estimateUnion(other)} </p>
* <p>This method produces estimate is roughly equivalent to the number of unique Hashers that have been merged into both
* of the filters by rounding the value from the calculation described in the {@link Shape} class javadoc.</p>
*
* <p>This produces estimate is roughly equivalent to the number of unique Hashers that have been merged into both
* of the filters.</p>
* <p><em>{@code estimateIntersection} should only be called with Bloom filters of the same Shape. If called on Bloom
* filters of differing shape this method is not symmetric. If {@code other} has more bits an {@code IllegalArgumentException}
* may be thrown.</em></p>
*
* @param other The other Bloom filter
* @return an estimate of the number of items in the intersection.
* @return an estimate of the number of items in the intersection. If the calculated estimate is larger than Integer.MAX_VALUE then MAX_VALUE is returned.
* @throws IllegalArgumentException if the estimated N for the union of the filters is infinite.
* @see #estimateN()
* @see Shape
*/
default int estimateIntersection(final BloomFilter other) {
Objects.requireNonNull(other, "other");
return estimateN() + other.estimateN() - estimateUnion(other);
double eThis = getShape().estimateN(cardinality());
double eOther = getShape().estimateN(other.cardinality());
if (Double.isInfinite(eThis) && Double.isInfinite(eOther)) {
// if both are infinite the union is infinite and we return Integer.MAX_VALUE
return Integer.MAX_VALUE;
}
long estimate;
// if one is infinite the intersection is the other.
if (Double.isInfinite(eThis)) {
estimate = Math.round(eOther);
} else if (Double.isInfinite(eOther)) {
estimate = Math.round(eThis);
} else {
BloomFilter union = this.copy();
union.merge(other);
double eUnion = getShape().estimateN(union.cardinality());
if (Double.isInfinite(eUnion)) {
throw new IllegalArgumentException("The estimated N for the union of the filters is infinite");
}
// maximum estimate value using integer values is: 46144189292 thus
// eThis + eOther can not overflow the long value.
estimate = Math.round(eThis + eOther - eUnion);
}
return estimate>Integer.MAX_VALUE?Integer.MAX_VALUE:(int) estimate;
}
}

View File

@ -36,6 +36,48 @@ package org.apache.commons.collections4.bloomfilter;
* <dd>{@code k = round((m / n) * ln(2))}</dd>
* </dl>
*
* <h2>Estimations from cardinality based on shape</h2>
*
* <p>Several estimates can be calculated from the Shape and the cardinality of a Bloom filter.</p>
*
* <p>In the calculation below the following values are used:</p>
* <ul>
* <li>double c = the cardinality of the Bloom filter.</li>
* <li>double m = numberOfBits as specified in the shape.</li>
* <li>double k = numberOfHashFunctions as specified in the shape.</li>
* </ul>
*
* <h3>Estimate N - n()</h3>
*
* <p>The calculation for the estimate of N is: {@code -(m/k) * ln(1 - (c/m))}. This is the calculation
* performed by the {@code Shape.estimateN(cardinality)} method below. This estimate is roughly equivalent to the
* number of hashers that have been merged into a filter to create the cardinality specified.</p>
*
* <p><em>Note:</em></p>
* <ul>
* <li>if cardinality == numberOfBits, then result is infinity.</li>
* <li>if cardinality &gt; numberOfBits, then result is NaN.</li>
* </ul>
*
* <h3>Estimate N of Union - n(A &cup; B)</h3>
*
* <p>To estimate the number of items in the union of two Bloom filters with the same shape, merge them together and
* calculate the estimated N from the result.</p>
*
* <h3>Estimate N of the Intersection - n(A &cap; B)</h3>
*
* <p>To estimate the number of items in the intersection of two Bloom filters A and B with the same shape the calculation is:
* n(A) + n(b) - n(A &cup; B).</p>
*
* <p>Care must be taken when any of the n(x) returns infinity. In general the following assumptions are true:
*
* <ul>
* <li>If n(A) = &infin; and n(B) &lt; &infin; then n(A &cap; B) = n(B)</li>
* <li>If n(A) &lt; &infin; and n(B) = &infin; then n(A &cap; B) = n(A)</li>
* <li>If n(A) = &infin; and n(B) = &infin; then n(A &cap; B) = &infin;</li>
* <li>If n(A) &lt; &infin; and n(B) &lt; &infin; and n(A &cup; B) = &infin; then n(A &cap; B) is undefined.</li>
* </ul>
*
* @see <a href="https://hur.st/bloomfilter">Bloom Filter calculator</a>
* @see <a href="https://en.wikipedia.org/wiki/Bloom_filter">Bloom filter
* [Wikipedia]</a>

View File

@ -114,10 +114,6 @@ public final class SimpleBloomFilter implements BloomFilter {
// idx[0] will be limit+1 so decrement it
idx[0]--;
final int idxLimit = BitMap.getLongIndex(shape.getNumberOfBits());
if (idxLimit < idx[0]) {
throw new IllegalArgumentException(String.format(
"BitMapProducer set a bit higher than the limit for the shape: %s", shape.getNumberOfBits() - 1));
}
if (idxLimit == idx[0]) {
final long excess = bitMap[idxLimit] >> shape.getNumberOfBits();
if (excess != 0) {

View File

@ -103,6 +103,18 @@ public abstract class AbstractBitMapProducerTest {
Arrays.fill(count, 0);
createProducer().forEachBitMapPair(createEmptyProducer(), lbp);
assertEquals(count[2], count[1]);
// test where the created producer does not process all records because the predicate function
// returns false before the processing is completed.
int[] limit = new int[1];
final LongBiPredicate shortFunc = (x, y) -> {
limit[0]++;
return limit[0] < 2;
};
final BitMapProducer shortProducer = l -> {
return true;
};
assertFalse(createProducer().forEachBitMapPair(shortProducer, shortFunc));
}
@Test

View File

@ -215,7 +215,7 @@ public abstract class AbstractBloomFilterTest<T extends BloomFilter> {
}
/**
* Tests that the andCardinality calculations are correct.
* Tests that the estimated intersection calculations are correct.
*/
@Test
public final void testEstimateIntersection() {
@ -223,14 +223,27 @@ public abstract class AbstractBloomFilterTest<T extends BloomFilter> {
final BloomFilter bf = createFilter(getTestShape(), TestingHashers.FROM1);
final BloomFilter bf2 = TestingHashers.populateFromHashersFrom1AndFrom11(createEmptyFilter(getTestShape()));
final BloomFilter bf3 = TestingHashers.populateEntireFilter(createEmptyFilter(getTestShape()));
assertEquals(1, bf.estimateIntersection(bf2));
assertEquals(1, bf2.estimateIntersection(bf));
assertEquals(1, bf.estimateIntersection(bf3));
assertEquals(1, bf2.estimateIntersection(bf));
assertEquals(2, bf3.estimateIntersection(bf2));
final BloomFilter bf3 = createEmptyFilter(getTestShape());
final BloomFilter bf4 = createEmptyFilter(getTestShape());
assertEquals(0, bf.estimateIntersection(bf3));
assertEquals(0, bf3.estimateIntersection(bf));
assertEquals(0, bf.estimateIntersection(bf4));
assertEquals(0, bf4.estimateIntersection(bf));
BloomFilter bf5 = TestingHashers.mergeHashers(createEmptyFilter(getTestShape()), new IncrementingHasher(0, 1)/* 0-16 */,
new IncrementingHasher(17, 1)/* 17-33 */, new IncrementingHasher(33, 1)/* 33-49 */);
BloomFilter bf6 = TestingHashers.mergeHashers(createEmptyFilter(getTestShape()), new IncrementingHasher(50, 1)/* 50-66 */,
new IncrementingHasher(67, 1)/* 67-83 */);
assertThrows(IllegalArgumentException.class, () -> bf5.estimateIntersection(bf6));
// infinite with infinite
assertEquals(Integer.MAX_VALUE, bf3.estimateIntersection(bf3));
}
/**
@ -256,18 +269,20 @@ public abstract class AbstractBloomFilterTest<T extends BloomFilter> {
@Test
public final void testEstimateN() {
// build a filter
final BloomFilter filter1 = createFilter(getTestShape(), TestingHashers.FROM1);
BloomFilter filter1 = createFilter(getTestShape(), TestingHashers.FROM1);
assertEquals(1, filter1.estimateN());
// the data provided above do not generate an estimate that is equivalent to the
// actual.
filter1.merge(new IncrementingHasher(4, 1));
assertEquals(1, filter1.estimateN());
filter1.merge(new IncrementingHasher(17, 1));
assertEquals(3, filter1.estimateN());
filter1 = TestingHashers.populateEntireFilter(createEmptyFilter(getTestShape()));
assertEquals(Integer.MAX_VALUE, filter1.estimateN());
}
/**

View File

@ -87,4 +87,22 @@ public class DefaultBitMapProducerTest extends AbstractBitMapProducerTest {
final long[] ary = BitMapProducer.fromBitMapArray(expected).asBitMapArray();
assertArrayEquals(expected, ary);
}
@Test
public void testAsBitMapArrayLargeArray() {
final long[] expected = generateLongArray(32);
BitMapProducer producer = new BitMapProducer() {
@Override
public boolean forEachBitMap(LongPredicate predicate) {
for (long l : expected) {
if (!predicate.test(l)) {
return false;
}
}
return true;
}
};
final long[] ary = producer.asBitMapArray();
assertArrayEquals(expected, ary);
}
}

View File

@ -18,6 +18,7 @@ package org.apache.commons.collections4.bloomfilter;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.assertThrows;
import java.util.TreeSet;
import java.util.function.IntPredicate;
@ -67,6 +68,74 @@ public class DefaultBloomFilterTest extends AbstractBloomFilterTest<DefaultBloom
.forEachBitMapPair(bf1, (x, y) -> x == y));
}
@Test
public void testEstimateNWithBrokenCardinality() {
// build a filter
BloomFilter filter1 = TestingHashers.populateEntireFilter(new BrokenCardinality(getTestShape()));
assertThrows(IllegalArgumentException.class, () -> filter1.estimateN());
}
@Test
public void testEstimateLargeN() {
Shape s = Shape.fromKM(1, Integer.MAX_VALUE);
// create a very large filter with Integer.MAX_VALUE-1 bits set.
BloomFilter bf1 = new SimpleBloomFilter(s);
bf1.merge((BitMapProducer) predicate -> {
int limit = Integer.MAX_VALUE - 1;
while (limit > 64) {
predicate.test(0xFFFFFFFFFFFFFFFFL);
limit -= 64;
}
long last = 0L;
for (int i = 0; i < limit; i++) {
last |= BitMap.getLongBit(i);
}
predicate.test(last);
return true;
});
// the actual result of the calculation is: 46144189292, so the returned value
// should be Integer.MAX_VALUE.
assertEquals(Integer.MAX_VALUE, bf1.estimateN());
}
@Test
public void testIntersectionLimit() {
Shape s = Shape.fromKM(1, Integer.MAX_VALUE);
// create a very large filter with Integer.MAX_VALUE-1 bit set.
BloomFilter bf1 = new SimpleBloomFilter(s);
bf1.merge((BitMapProducer) predicate -> {
int limit = Integer.MAX_VALUE - 1;
while (limit > 64) {
predicate.test(0xFFFFFFFFFFFFFFFFL);
limit -= 64;
}
long last = 0L;
for (int i = 0; i < limit; i++) {
last |= BitMap.getLongBit(i);
}
predicate.test(last);
return true;
});
// the actual result of the calculation is: 46144189292
assertEquals(Integer.MAX_VALUE, bf1.estimateIntersection(bf1));
}
@Test
public void testSparseNonSparseMerging() {
BloomFilter bf1 = new SparseDefaultBloomFilter(getTestShape());
bf1.merge(TestingHashers.FROM1);
BloomFilter bf2 = new NonSparseDefaultBloomFilter(getTestShape());
bf2.merge(TestingHashers.FROM11);
BloomFilter result = bf1.copy();
result.merge(bf2);
assertEquals(27, result.cardinality());
result = bf2.copy();
result.merge(bf1);
assertEquals(27, result.cardinality());
}
abstract static class AbstractDefaultBloomFilter implements BloomFilter {
private final Shape shape;
protected TreeSet<Integer> indices;
@ -178,9 +247,21 @@ public class DefaultBloomFilterTest extends AbstractBloomFilterTest<DefaultBloom
@Override
public AbstractDefaultBloomFilter copy() {
final AbstractDefaultBloomFilter result = new SparseDefaultBloomFilter(getShape());
final AbstractDefaultBloomFilter result = new NonSparseDefaultBloomFilter(getShape());
result.indices.addAll(indices);
return result;
}
}
static class BrokenCardinality extends NonSparseDefaultBloomFilter {
BrokenCardinality(Shape shape) {
super(shape);
}
@Override
public int cardinality() {
return super.cardinality() + 1;
}
}
}

View File

@ -16,6 +16,11 @@
*/
package org.apache.commons.collections4.bloomfilter;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import org.junit.jupiter.api.Test;
/**
* Tests for the {@link SimpleBloomFilter}.
*/
@ -24,4 +29,16 @@ public class SimpleBloomFilterTest extends AbstractBloomFilterTest<SimpleBloomFi
protected SimpleBloomFilter createEmptyFilter(final Shape shape) {
return new SimpleBloomFilter(shape);
}
@Test
public void testMergeShortBitMapProducer() {
SimpleBloomFilter filter = createEmptyFilter(getTestShape());
// create a producer that returns too few values
// shape expects 2 longs we are sending 1.
BitMapProducer producer = p -> {
return p.test(2L);
};
assertTrue(filter.merge(producer));
assertEquals(1, filter.cardinality());
}
}