Standardise Shape constructor validations.

Standardise the constructor assertions to functions.

Ensure Shape catches NaN probability in the constructor.

Previously NaN would result in a NaN computation for the number of bits.
When cast to int it would be zero. This change improves the error
message in the exception.

Clean-up javadocs.

Ensure Shape is final. If not final then the rest of the Bloom filter
API cannot assume that a Shape is valid as it may be extended and the
computations changed.
This commit is contained in:
Alex Herbert 2020-03-10 00:29:04 +00:00
parent cb88c4ed01
commit 0964d5bf19
2 changed files with 212 additions and 131 deletions

View File

@ -43,30 +43,30 @@ import java.util.Objects;
* [Wikipedia]</a> * [Wikipedia]</a>
* @since 4.5 * @since 4.5
*/ */
public class Shape { public final class Shape {
/** /**
* The natural logarithm of 2. Used in several calculations. approx 0.693147180 * The natural logarithm of 2. Used in several calculations. Approximately 0.693147180.
*/ */
private static final double LOG_OF_2 = Math.log(2.0); private static final double LOG_OF_2 = Math.log(2.0);
/** /**
* 1 / 2^log(2) approx -0.090619058. Used in calculating the number of bits. * 1 / 2^log(2). Used in calculating the number of bits. Approximately -0.090619058.
*/ */
private static final double DENOMINATOR = Math.log(1.0 / (Math.pow(2.0, LOG_OF_2))); private static final double DENOMINATOR = Math.log(1.0 / Math.pow(2.0, LOG_OF_2));
/** /**
* number of items in the filter. (AKA: {@code n}) * Number of items in the filter. AKA: {@code n}.
*/ */
private final int numberOfItems; private final int numberOfItems;
/** /**
* number of bits in the filter. (AKA: {@code m}) * Number of bits in the filter. AKA: {@code m}.
*/ */
private final int numberOfBits; private final int numberOfBits;
/** /**
* number of hash functions. (AKA: {@code k}) * Number of hash functions. AKA: {@code k}.
*/ */
private final int numberOfHashFunctions; private final int numberOfHashFunctions;
@ -81,34 +81,36 @@ public class Shape {
private final HashFunctionIdentity hashFunctionIdentity; private final HashFunctionIdentity hashFunctionIdentity;
/** /**
* Constructs a filter configuration with the specified number of items and * Constructs a filter configuration with a desired false-positive probability ({@code p}) and the
* probability. * specified number of bits ({@code m}) and hash functions ({@code k}).
* *
* @param hashFunctionIdentity The HashFunctionIdentity of the hash function this shape uses. * <p>The number of items ({@code n}) to be stored in the filter is computed.
* @param probability The probability of duplicates. Must be in the range * <pre>n = ceil(m / (-k / log(1 - exp(log(p) / k))))</pre>
* (0.0,1.0). *
* @param numberOfBits The number of bits in the filter. * <p>The actual probability will be approximately equal to the
* @param numberOfHashFunctions The number of hash functions in the filter. * desired probability but will be dependent upon the calculated Bloom filter capacity
* (number of items). An exception is raised if this is greater than or equal to 1 (i.e. the
* shape is invalid for use as a Bloom filter).
*
* @param hashFunctionIdentity The identity of the hash function this shape uses
* @param probability The desired false-positive probability in the range {@code (0, 1)}
* @param numberOfBits The number of bits in the filter
* @param numberOfHashFunctions The number of hash functions in the filter
* @throws NullPointerException if the hash function identity is null
* @throws IllegalArgumentException if the desired probability is not in the range {@code (0, 1)}
* @throws IllegalArgumentException if the number of bits is not above 8
* @throws IllegalArgumentException if the number of hash functions is not strictly positive
* @throws IllegalArgumentException if the calculated probability is not below 1
* @see #getProbability()
*/ */
public Shape(final HashFunctionIdentity hashFunctionIdentity, final double probability, final int numberOfBits, public Shape(final HashFunctionIdentity hashFunctionIdentity, final double probability, final int numberOfBits,
final int numberOfHashFunctions) { final int numberOfHashFunctions) {
Objects.requireNonNull(hashFunctionIdentity, "hashFunctionIdentity"); this.hashFunctionIdentity = Objects.requireNonNull(hashFunctionIdentity, "hashFunctionIdentity");
if (probability <= 0.0) { checkProbability(probability);
throw new IllegalArgumentException("Probability must be greater than 0.0"); this.numberOfBits = checkNumberOfBits(numberOfBits);
} this.numberOfHashFunctions = checkNumberOfHashFunctions(numberOfHashFunctions);
if (probability >= 1.0) {
throw new IllegalArgumentException("Probability must be less than 1.0");
}
if (numberOfBits < 8) {
throw new IllegalArgumentException("Number of bits must be greater than or equal to 8");
}
if (numberOfHashFunctions < 1) {
throw new IllegalArgumentException("Number of hash functions must be greater than or equal to 8");
}
this.hashFunctionIdentity = hashFunctionIdentity;
this.numberOfBits = numberOfBits;
this.numberOfHashFunctions = numberOfHashFunctions;
// Number of items (n):
// n = ceil(m / (-k / log(1 - exp(log(p) / k)))) // n = ceil(m / (-k / log(1 - exp(log(p) / k))))
final double n = Math.ceil(numberOfBits / final double n = Math.ceil(numberOfBits /
(-numberOfHashFunctions / Math.log(1 - Math.exp(Math.log(probability) / numberOfHashFunctions)))); (-numberOfHashFunctions / Math.log(1 - Math.exp(Math.log(probability) / numberOfHashFunctions))));
@ -124,103 +126,188 @@ public class Shape {
// similarly we can not produce a number greater than numberOfBits so we // similarly we can not produce a number greater than numberOfBits so we
// do not have to check for Integer.MAX_VALUE either. // do not have to check for Integer.MAX_VALUE either.
this.numberOfItems = (int) n; this.numberOfItems = (int) n;
this.hashCode = generateHashCode();
// check that probability is within range // check that probability is within range
getProbability(); checkCalculatedProbability(getProbability());
this.hashCode = generateHashCode();
} }
/** /**
* Constructs a filter configuration with the specified number of items and * Constructs a filter configuration with the specified number of items ({@code n}) and
* probability. <p> The actual probability will be approximately equal to the * desired false-positive probability ({@code p}).
* desired probability but will be dependent upon the calculated bloom filter size
* and function count. </p>
* *
* @param hashFunctionIdentity The HashFunctionIdentity of the hash function this shape uses. * <p>The number of bits ({@code m}) for the filter is computed.
* @param numberOfItems Number of items to be placed in the filter. * <pre>m = ceil(n * log(p) / log(1 / 2^log(2)))</pre>
* @param probability The desired probability of duplicates. Must be in the range *
* (0.0,1.0). * <p>The optimal number of hash functions ({@code k}) is computed.
* <pre>k = round((m / n) * log(2))</pre>
*
* <p>The actual probability will be approximately equal to the
* desired probability but will be dependent upon the calculated number of bits and hash
* functions. An exception is raised if this is greater than or equal to 1 (i.e. the
* shape is invalid for use as a Bloom filter).
*
* @param hashFunctionIdentity The identity of the hash function this shape uses
* @param numberOfItems Number of items to be placed in the filter
* @param probability The desired false-positive probability in the range {@code (0, 1)}
* @throws NullPointerException if the hash function identity is null
* @throws IllegalArgumentException if the number of items is not strictly positive
* @throws IllegalArgumentException if the desired probability is not in the range {@code (0, 1)}
* @throws IllegalArgumentException if the calculated probability is not below 1
* @see #getProbability()
*/ */
public Shape(final HashFunctionIdentity hashFunctionIdentity, final int numberOfItems, final double probability) { public Shape(final HashFunctionIdentity hashFunctionIdentity, final int numberOfItems, final double probability) {
Objects.requireNonNull(hashFunctionIdentity, "hashFunctionIdentity"); this.hashFunctionIdentity = Objects.requireNonNull(hashFunctionIdentity, "hashFunctionIdentity");
if (numberOfItems < 1) { this.numberOfItems = checkNumberOfItems(numberOfItems);
throw new IllegalArgumentException("Number of Items must be greater than 0"); checkProbability(probability);
}
if (probability <= 0.0) { // Number of bits (m)
throw new IllegalArgumentException("Probability must be greater than 0.0");
}
if (probability >= 1.0) {
throw new IllegalArgumentException("Probability must be less than 1.0");
}
this.hashFunctionIdentity = hashFunctionIdentity;
this.numberOfItems = numberOfItems;
/*
* number of bits is called "m" in most mathematical statement describing
* bloom filters so we use it here.
*/
final double m = Math.ceil(numberOfItems * Math.log(probability) / DENOMINATOR); final double m = Math.ceil(numberOfItems * Math.log(probability) / DENOMINATOR);
if (m > Integer.MAX_VALUE) { if (m > Integer.MAX_VALUE) {
throw new IllegalArgumentException("Resulting filter has more than " + Integer.MAX_VALUE + " bits"); throw new IllegalArgumentException("Resulting filter has more than " + Integer.MAX_VALUE + " bits: " + m);
} }
this.numberOfBits = (int) m; this.numberOfBits = (int) m;
this.numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits); this.numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits);
this.hashCode = generateHashCode();
// check that probability is within range // check that probability is within range
getProbability(); checkCalculatedProbability(getProbability());
this.hashCode = generateHashCode();
} }
/** /**
* Constructs a filter configuration with the specified number of items and * Constructs a filter configuration with the specified number of items ({@code n}) and
* probability. * bits ({@code m}).
* *
* @param hashFunctionIdentity The HashFunctionIdentity of the hash function this shape uses. * <p>The optimal number of hash functions ({@code k}) is computed.
* @param numberOfItems Number of items to be placed in the filter. * <pre>k = round((m / n) * log(2))</pre>
* @param numberOfBits The number of bits in the filter. *
* <p>The false-positive probability is computed using the number of items, bits and hash
* functions. An exception is raised if this is greater than or equal to 1 (i.e. the
* shape is invalid for use as a Bloom filter).
*
* @param hashFunctionIdentity The identity of the hash function this shape uses
* @param numberOfItems Number of items to be placed in the filter
* @param numberOfBits The number of bits in the filter
* @throws NullPointerException if the hash function identity is null
* @throws IllegalArgumentException if the number of items is not strictly positive
* @throws IllegalArgumentException if the number of bits is not above 8
* @throws IllegalArgumentException if the calculated number of hash function is below 1
* @throws IllegalArgumentException if the calculated probability is not below 1
* @see #getProbability()
*/ */
public Shape(final HashFunctionIdentity hashFunctionIdentity, final int numberOfItems, final int numberOfBits) { public Shape(final HashFunctionIdentity hashFunctionIdentity, final int numberOfItems, final int numberOfBits) {
Objects.requireNonNull(hashFunctionIdentity, "hashFunctionIdentity"); this.hashFunctionIdentity = Objects.requireNonNull(hashFunctionIdentity, "hashFunctionIdentity");
if (numberOfItems < 1) { this.numberOfItems = checkNumberOfItems(numberOfItems);
throw new IllegalArgumentException("Number of Items must be greater than 0"); this.numberOfBits = checkNumberOfBits(numberOfBits);
}
if (numberOfBits < 8) {
throw new IllegalArgumentException("Number of Bits must be greater than or equal to 8");
}
this.hashFunctionIdentity = hashFunctionIdentity;
this.numberOfItems = numberOfItems;
this.numberOfBits = numberOfBits;
this.numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits); this.numberOfHashFunctions = calculateNumberOfHashFunctions(numberOfItems, numberOfBits);
this.hashCode = generateHashCode();
// check that probability is within range // check that probability is within range
getProbability(); checkCalculatedProbability(getProbability());
this.hashCode = generateHashCode();
} }
/** /**
* Constructs a filter configuration with the specified number of items and * Constructs a filter configuration with the specified number of items, bits
* probability. * and hash functions.
* *
* @param hashFunctionIdentity The HashFunctionIdentity of the hash function this shape uses. * <p>The false-positive probability is computed using the number of items, bits and hash
* @param numberOfItems Number of items to be placed in the filter. * functions. An exception is raised if this is greater than or equal to 1 (i.e. the
* shape is invalid for use as a Bloom filter).
*
* @param hashFunctionIdentity The identity of the hash function this shape uses
* @param numberOfItems Number of items to be placed in the filter
* @param numberOfBits The number of bits in the filter. * @param numberOfBits The number of bits in the filter.
* @param numberOfHashFunctions The number of hash functions in the filter. * @param numberOfHashFunctions The number of hash functions in the filter
* @throws NullPointerException if the hash function identity is null
* @throws IllegalArgumentException if the number of items is not strictly positive
* @throws IllegalArgumentException if the number of bits is not above 8
* @throws IllegalArgumentException if the number of hash functions is not strictly positive
* @throws IllegalArgumentException if the calculated probability is not below 1
* @see #getProbability()
*/ */
public Shape(final HashFunctionIdentity hashFunctionIdentity, final int numberOfItems, final int numberOfBits, public Shape(final HashFunctionIdentity hashFunctionIdentity, final int numberOfItems, final int numberOfBits,
final int numberOfHashFunctions) { final int numberOfHashFunctions) {
Objects.requireNonNull(hashFunctionIdentity, "hashFunctionIdentity"); this.hashFunctionIdentity = Objects.requireNonNull(hashFunctionIdentity, "hashFunctionIdentity");
if (numberOfItems < 1) { this.numberOfItems = checkNumberOfItems(numberOfItems);
throw new IllegalArgumentException("Number of Items must be greater than 0"); this.numberOfBits = checkNumberOfBits(numberOfBits);
} this.numberOfHashFunctions = checkNumberOfHashFunctions(numberOfHashFunctions);
if (numberOfBits < 8) {
throw new IllegalArgumentException("Number of Bits must be greater than or equal to 8");
}
if (numberOfHashFunctions < 1) {
throw new IllegalArgumentException("Number of Hash Functions must be greater than or equal to 8");
}
this.hashFunctionIdentity = hashFunctionIdentity;
this.numberOfItems = numberOfItems;
this.numberOfBits = numberOfBits;
this.numberOfHashFunctions = numberOfHashFunctions;
this.hashCode = generateHashCode();
// check that probability is within range // check that probability is within range
getProbability(); checkCalculatedProbability(getProbability());
this.hashCode = generateHashCode();
}
/**
* Check number of items is strictly positive.
*
* @param numberOfItems the number of items
* @return the number of items
* @throws IllegalArgumentException if the number of items is not strictly positive
*/
private static int checkNumberOfItems(final int numberOfItems) {
if (numberOfItems < 1) {
throw new IllegalArgumentException("Number of items must be greater than 0: " + numberOfItems);
}
return numberOfItems;
}
/**
* Check number of bits is above 8.
*
* @param numberOfBits the number of bits
* @return the number of bits
* @throws IllegalArgumentException if the number of bits is not above 8
*/
private static int checkNumberOfBits(final int numberOfBits) {
if (numberOfBits < 8) {
throw new IllegalArgumentException("Number of bits must be greater than or equal to 8: " + numberOfBits);
}
return numberOfBits;
}
/**
* Check number of hash functions is strictly positive
*
* @param numberOfHashFunctions the number of hash functions
* @return the number of hash functions
* @throws IllegalArgumentException if the number of hash functions is not strictly positive
*/
private static int checkNumberOfHashFunctions(final int numberOfHashFunctions) {
if (numberOfHashFunctions < 1) {
throw new IllegalArgumentException("Number of hash functions must be greater than 0: " + numberOfHashFunctions);
}
return numberOfHashFunctions;
}
/**
* Check the probability is in the range 0.0, exclusive, to 1.0, exclusive.
*
* @param probability the probability
* @throws IllegalArgumentException if the probability is not in the range {@code (0, 1)}
*/
private static void checkProbability(final double probability) {
// Using the negation of within the desired range will catch NaN
if (!(probability > 0.0 && probability < 1.0)) {
throw new IllegalArgumentException("Probability must be greater than 0 and less than 1: " + probability);
}
}
/**
* Check the calculated probability is below 1.0.
*
* <p>This function is used to verify that the dynamically calculated probability for the
* Shape is in the valid range 0 to 1 exclusive. This need only be performed once upon
* construction.
*
* @param probability the probability
* @throws IllegalArgumentException if the calculated probability is not below 1
*/
private static void checkCalculatedProbability(final double probability) {
// We do not need to check for p < = since we only allow positive values for
// parameters and the closest we can come to exp(-kn/m) == 1 is
// exp(-1/Integer.MAX_INT) approx 0.9999999995343387 so Math.pow( x, y ) will
// always be 0<x<1 and y>0
if (probability >= 1.0) {
throw new IllegalArgumentException(
String.format("Calculated probability is greater than or equal to 1: " + probability));
}
} }
/** /**
@ -230,23 +317,20 @@ public class Shape {
* @param numberOfItems the number of items in the filter. * @param numberOfItems the number of items in the filter.
* @param numberOfBits the number of bits in the filter. * @param numberOfBits the number of bits in the filter.
* @return the optimal number of hash functions. * @return the optimal number of hash functions.
* @throws IllegalArgumentException if the calculated number of hash function is below 1
*/ */
private static int calculateNumberOfHashFunctions(final int numberOfItems, final int numberOfBits) { private static int calculateNumberOfHashFunctions(final int numberOfItems, final int numberOfBits) {
/* // k = round((m / n) * log(2)) We change order so that we use real math rather
* k = round((m / n) * log(2)) We change order so that we use real math rather // than integer math.
* than integer math.
*/
final long k = Math.round(LOG_OF_2 * numberOfBits / numberOfItems); final long k = Math.round(LOG_OF_2 * numberOfBits / numberOfItems);
if (k < 1) { if (k < 1) {
throw new IllegalArgumentException( throw new IllegalArgumentException(
String.format("Filter to small: Calculated number of hash functions (%s) was less than 1", k)); String.format("Filter too small: Calculated number of hash functions (%s) was less than 1", k));
} }
/* // Normally we would check that numberofHashFunctions <= Integer.MAX_VALUE but
* normally we would check that numberofHashFunctions <= Integer.MAX_VALUE but // since numberOfBits is at most Integer.MAX_VALUE the numerator of
* since numberOfBits is at most Integer.MAX_VALUE the numerator of // numberofHashFunctions is log(2) * Integer.MAX_VALUE = 646456992.9449 the
* numberofHashFunctions is log(2) * Integer.MAX_VALUE = 646456992.9449 the // value of k can not be above Integer.MAX_VALUE.
* value of k can not be above Integer.MAX_VALUE.
*/
return (int) k; return (int) k;
} }
@ -255,8 +339,8 @@ public class Shape {
if (o instanceof Shape) { if (o instanceof Shape) {
final Shape other = (Shape) o; final Shape other = (Shape) o;
return return
other.getNumberOfBits() == getNumberOfBits() && getNumberOfBits() == other.getNumberOfBits() &&
other.getNumberOfHashFunctions() == getNumberOfHashFunctions() && getNumberOfHashFunctions() == other.getNumberOfHashFunctions() &&
HashFunctionValidator.areEqual(getHashFunctionIdentity(), HashFunctionValidator.areEqual(getHashFunctionIdentity(),
other.getHashFunctionIdentity()); other.getHashFunctionIdentity());
} }
@ -276,7 +360,7 @@ public class Shape {
} }
/** /**
* Gets the number of bits in the Bloom filter. AKA: {@code m} * Gets the number of bits in the Bloom filter. AKA: {@code m}.
* *
* @return the number of bits in the Bloom filter. * @return the number of bits in the Bloom filter.
*/ */
@ -290,11 +374,11 @@ public class Shape {
* @return the number of bytes in the Bloom filter. * @return the number of bytes in the Bloom filter.
*/ */
public int getNumberOfBytes() { public int getNumberOfBytes() {
return Double.valueOf(Math.ceil(numberOfBits / (double) Byte.SIZE )).intValue(); return Double.valueOf(Math.ceil(numberOfBits / (double) Byte.SIZE)).intValue();
} }
/** /**
* Gets the number of hash functions used to construct the filter. AKA: {@code k} * Gets the number of hash functions used to construct the filter. AKA: {@code k}.
* *
* @return the number of hash functions used to construct the filter. * @return the number of hash functions used to construct the filter.
*/ */
@ -303,7 +387,7 @@ public class Shape {
} }
/** /**
* Gets the number of items that are expected in the filter. AKA: {@code n} * Gets the number of items that are expected in the filter. AKA: {@code n}.
* *
* @return the number of items. * @return the number of items.
*/ */
@ -312,27 +396,18 @@ public class Shape {
} }
/** /**
* Calculates the probability of false positives (AKA: {@code p} given * Calculates the probability of false positives ({@code p}) given
* numberOfItems, numberofBits and numberOfHashFunctions. This is a method so that * numberOfItems ({@code n}), numberOfBits ({@code m}) and numberOfHashFunctions ({@code k}).
* the calculation is consistent across all constructors. * <pre>p = (1 - exp(-kn/m))^k</pre>
*
* <p>This is the probability that a Bloom filter will return true for the presence of an item
* when it does not contain the item.
* *
* @return the probability of collision. * @return the probability of collision.
*/ */
public final double getProbability() { public double getProbability() {
// (1 - exp(-kn/m))^k return Math.pow(1.0 - Math.exp(-1.0 * numberOfHashFunctions * numberOfItems / numberOfBits),
final double p = Math.pow(1.0 - Math.exp(-1.0 * numberOfHashFunctions * numberOfItems / numberOfBits),
numberOfHashFunctions); numberOfHashFunctions);
/*
* We do not need to check for p < = since we only allow positive values for
* parameters and the closest we can come to exp(-kn/m) == 1 is
* exp(-1/Integer.MAX_INT) approx 0.9999999995343387 so Math.pow( x, y ) will
* always be 0<x<1 and y>0
*/
if (p >= 1.0) {
throw new IllegalArgumentException(
String.format("Calculated probability (%s) is greater than or equal to 1.0", p));
}
return p;
} }
@Override @Override

View File

@ -227,6 +227,12 @@ public class ShapeTest {
} catch (final IllegalArgumentException expected) { } catch (final IllegalArgumentException expected) {
// do nothing. // do nothing.
} }
try {
new Shape(testFunction, 10, Double.NaN);
fail("Should have thrown IllegalArgumentException");
} catch (final IllegalArgumentException expected) {
// do nothing.
}
} }
/** /**