HBASE-908 Add approximate counting to CountingBloomFilter

git-svn-id: https://svn.apache.org/repos/asf/hadoop/hbase/trunk@700631 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael Stack 2008-09-30 23:34:05 +00:00
parent 5ae403ea35
commit b45191d2d9
3 changed files with 58 additions and 3 deletions

View File

@ -23,6 +23,8 @@ Release 0.19.0 - Unreleased
should emit a sorted list of tables (Krzysztof Szlapinski via Stack)
HBASE-884 Double and float converters for Bytes class
(Doğacan Güney via Stack)
HBASE-908 Add approximate counting to CountingBloomFilter
(Andrzej Bialecki via Stack)
NEW FEATURES
HBASE-875 Use MurmurHash instead of JenkinsHash [in bloomfilters]

View File

@ -195,6 +195,44 @@ public final class CountingBloomFilter extends Filter {
return true;
}//end membershipTest()
/**
 * Estimates how many times the given key has been added to this filter,
 * which lets the filter double as an approximate
 * <code>key -&gt; count</code> map.
 * <p>The estimate is the smallest 4-bit bucket value found among the
 * <code>nbHash</code> positions the key hashes to.
 * <p>NOTE: because every bucket is only 4 bits wide, adding the same key
 * more than 15 times overflows all filter positions associated with that
 * key and significantly raises the error rate for this and other keys.
 * The filter is therefore only suitable for small counts,
 * <code>0 &lt;= N &lt;&lt; 15</code> (N much smaller than 15).
 * @param key key to be tested
 * @return 0 if the key is not present. Otherwise a positive value v such
 * that <code>v == count</code>, except with a probability on the order of
 * the error rate of this filter, in which case <code>v &gt; count</code>.
 * Additionally, if the filter experienced an underflow as a result of a
 * {@link #delete(Key)} operation, the returned value may be lower than
 * <code>count</code>, with the probability of the false negative rate of
 * such a filter.
 */
public int approximateCount(Key key) {
  // Largest value a single 4-bit bucket can hold; also used as the bucket mask.
  final long maxBucketValue = 15L;

  int[] positions = hash.hash(key);
  // NOTE(review): clear() presumably resets the shared hash function so it
  // can be reused by the next call -- confirm against the hash class.
  hash.clear();

  // Sentinel meaning "no bucket examined yet"; real bucket values never exceed 15.
  int smallest = Integer.MAX_VALUE;
  for (int n = 0; n < nbHash; n++) {
    int position = positions[n];
    // 16 buckets are packed into each word of the bucket array.
    int word = position >> 4;             // position / 16
    int shift = (position & 0x0f) << 2;   // (position % 16) * 4 bits
    long mask = maxBucketValue << shift;
    // Unsigned shift so the top bucket (bits 60-63) is read correctly.
    long counter = (buckets[word] & mask) >>> shift;
    if (counter < smallest) {
      smallest = (int) counter;
    }
  }

  // The sentinel survives only when no position was inspected (nbHash == 0).
  return (smallest == Integer.MAX_VALUE) ? 0 : smallest;
}
@Override
public void not(){
throw new UnsupportedOperationException("not() is undefined for "

View File

@ -291,13 +291,28 @@ public class TestFilter extends TestCase {
assertFalse(bf.membershipTest(new StringKey("xyzzy")));
assertFalse(bf.membershipTest(new StringKey("abcd")));
// to test for overflows, add 'key' enough times to overflow an 8bit bucket,
// to test for overflows, add 'key' enough times to overflow a 4bit bucket,
// while asserting that it stays a member
for(int i = 0; i < 257; i++){
for(int i = 0; i < 16; i++){
bf.add(key);
assertTrue(bf.membershipTest(key));
}
// test approximateCount
CountingBloomFilter bf3 = new CountingBloomFilter(4, 2, Hash.JENKINS_HASH);
// test the exact range
for (int i = 0; i < 8; i++) {
bf3.add(key);
bf3.add(k2);
assertEquals(bf3.approximateCount(key), i + 1);
assertEquals(bf3.approximateCount(k2), i + 1);
}
// test gently degraded counting in high-fill, high error rate filter
for (int i = 8; i < 15; i++) {
bf3.add(key);
assertTrue(bf3.approximateCount(key) >= (i + 1));
assertEquals(bf3.approximateCount(k2), 8);
assertEquals(bf3.approximateCount(k3), 0);
}
}
/** Test a DynamicBloomFilter