HBASE-908 Add approximate counting to CountingBloomFilter
git-svn-id: https://svn.apache.org/repos/asf/hadoop/hbase/trunk@700631 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5ae403ea35
commit
b45191d2d9
|
@ -23,6 +23,8 @@ Release 0.19.0 - Unreleased
|
|||
should emit a sorted list of tables (Krzysztof Szlapinski via Stack)
|
||||
HBASE-884 Double and float converters for Bytes class
|
||||
(Doğacan Güney via Stack)
|
||||
HBASE-908 Add approximate counting to CountingBloomFilter
|
||||
(Andrzej Bialecki via Stack)
|
||||
|
||||
NEW FEATURES
|
||||
HBASE-875 Use MurmurHash instead of JenkinsHash [in bloomfilters]
|
||||
|
|
|
@ -195,6 +195,44 @@ public final class CountingBloomFilter extends Filter {
|
|||
return true;
|
||||
}//end membershipTest()
|
||||
|
||||
/**
|
||||
* This method calculates an approximate count of the key, i.e. how many
|
||||
* times the key was added to the filter. This allows the filter to be
|
||||
* used as an approximate <code>key -> count</code> map.
|
||||
* <p>NOTE: due to the bucket size of this filter, inserting the same
|
||||
* key more than 15 times will cause an overflow at all filter positions
|
||||
* associated with this key, and it will significantly increase the error
|
||||
* rate for this and other keys. For this reason the filter can only be
|
||||
* used to store small count values <code>0 <= N << 15</code>.
|
||||
* @param key key to be tested
|
||||
* @return 0 if the key is not present. Otherwise, a positive value v will
|
||||
* be returned such that <code>v == count</code> with probability equal to the
|
||||
* error rate of this filter, and <code>v > count</code> otherwise.
|
||||
* Additionally, if the filter experienced an underflow as a result of
|
||||
* {@link #delete(Key)} operation, the return value may be lower than the
|
||||
* <code>count</code> with the probability of the false negative rate of such
|
||||
* filter.
|
||||
*/
|
||||
public int approximateCount(Key key) {
|
||||
int res = Integer.MAX_VALUE;
|
||||
int[] h = hash.hash(key);
|
||||
hash.clear();
|
||||
for (int i = 0; i < nbHash; i++) {
|
||||
// find the bucket
|
||||
int wordNum = h[i] >> 4; // div 16
|
||||
int bucketShift = (h[i] & 0x0f) << 2; // (mod 16) * 4
|
||||
|
||||
long bucketMask = 15L << bucketShift;
|
||||
long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;
|
||||
if (bucketValue < res) res = (int)bucketValue;
|
||||
}
|
||||
if (res != Integer.MAX_VALUE) {
|
||||
return res;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void not(){
|
||||
throw new UnsupportedOperationException("not() is undefined for "
|
||||
|
|
|
@ -291,13 +291,28 @@ public class TestFilter extends TestCase {
|
|||
assertFalse(bf.membershipTest(new StringKey("xyzzy")));
|
||||
assertFalse(bf.membershipTest(new StringKey("abcd")));
|
||||
|
||||
// to test for overflows, add 'key' enough times to overflow an 8bit bucket,
|
||||
// to test for overflows, add 'key' enough times to overflow a 4bit bucket,
|
||||
// while asserting that it stays a member
|
||||
for(int i = 0; i < 257; i++){
|
||||
for(int i = 0; i < 16; i++){
|
||||
bf.add(key);
|
||||
assertTrue(bf.membershipTest(key));
|
||||
}
|
||||
|
||||
// test approximateCount
|
||||
CountingBloomFilter bf3 = new CountingBloomFilter(4, 2, Hash.JENKINS_HASH);
|
||||
// test the exact range
|
||||
for (int i = 0; i < 8; i++) {
|
||||
bf3.add(key);
|
||||
bf3.add(k2);
|
||||
assertEquals(bf3.approximateCount(key), i + 1);
|
||||
assertEquals(bf3.approximateCount(k2), i + 1);
|
||||
}
|
||||
// test gently degraded counting in high-fill, high error rate filter
|
||||
for (int i = 8; i < 15; i++) {
|
||||
bf3.add(key);
|
||||
assertTrue(bf3.approximateCount(key) >= (i + 1));
|
||||
assertEquals(bf3.approximateCount(k2), 8);
|
||||
assertEquals(bf3.approximateCount(k3), 0);
|
||||
}
|
||||
}
|
||||
|
||||
/** Test a DynamicBloomFilter
|
||||
|
|
Loading…
Reference in New Issue