diff --git a/CHANGES.txt b/CHANGES.txt
index f66635fedcf..e5170ab7c9a 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -23,6 +23,8 @@ Release 0.19.0 - Unreleased
                should emit a sorted list of tables (Krzysztof Szlapinski via Stack)
    HBASE-884   Double and float converters for Bytes class
                (Doğacan Güney via Stack)
+   HBASE-908   Add approximate counting to CountingBloomFilter
+               (Andrzej Bialecki via Stack)
 
   NEW FEATURES
    HBASE-875   Use MurmurHash instead of JenkinsHash [in bloomfilters]
diff --git a/src/java/org/onelab/filter/CountingBloomFilter.java b/src/java/org/onelab/filter/CountingBloomFilter.java
index 9793d172004..00c346431ff 100644
--- a/src/java/org/onelab/filter/CountingBloomFilter.java
+++ b/src/java/org/onelab/filter/CountingBloomFilter.java
@@ -195,6 +195,48 @@ public final class CountingBloomFilter extends Filter {
     return true;
   }//end membershipTest()
 
+  /**
+   * Calculates an approximate count of the key, i.e. how many times the key
+   * was added to the filter. This allows the filter to be used as an
+   * approximate {@code key -> count} map.
+   * <p>
+   * The estimate is the smallest value among the 4-bit buckets selected by
+   * the key's hash positions.
+   * <p>
+   * NOTE: due to the bucket size of this filter, inserting the same
+   * key more than 15 times will cause an overflow at all filter positions
+   * associated with this key, and it will significantly increase the error
+   * rate for this and other keys. For this reason the filter can only be
+   * used to store small count values, i.e. {@code 0 <= N} with N much
+   * smaller than 15.
+   *
+   * @param key key to be tested
+   * @return 0 if the key is not present. Otherwise, a positive value v will
+   * be returned such that {@code v > count} with probability equal to the
+   * error rate of this filter, and {@code v == count} otherwise.
+   * Additionally, if the filter experienced an underflow as a result of
+   * {@link #delete(Key)} operation, the return value may be lower than the
+   * count with the probability of the false negative rate of such
+   * filter.
+   */
+  public int approximateCount(Key key) {
+    int res = Integer.MAX_VALUE;
+    int[] h = hash.hash(key);
+    hash.clear();
+    for (int i = 0; i < nbHash; i++) {
+      // find the bucket: 16 buckets of 4 bits are packed into each long word
+      int wordNum = h[i] >> 4;               // div 16
+      int bucketShift = (h[i] & 0x0f) << 2;  // (mod 16) * 4
+      long bucketMask = 15L << bucketShift;
+      long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;
+      if (bucketValue < res) {
+        res = (int) bucketValue;
+      }
+    }
+    // res is still MAX_VALUE only when no bucket was examined (nbHash == 0)
+    return (res != Integer.MAX_VALUE) ? res : 0;
+  }
+
   @Override
   public void not(){
     throw new UnsupportedOperationException("not() is undefined for "
diff --git a/src/test/org/onelab/test/TestFilter.java b/src/test/org/onelab/test/TestFilter.java
index e39616f0ad0..6c88c1ab33f 100644
--- a/src/test/org/onelab/test/TestFilter.java
+++ b/src/test/org/onelab/test/TestFilter.java
@@ -291,13 +291,28 @@ public class TestFilter extends TestCase {
     assertFalse(bf.membershipTest(new StringKey("xyzzy")));
     assertFalse(bf.membershipTest(new StringKey("abcd")));
 
-    // to test for overflows, add 'key' enough times to overflow an 8bit bucket,
+    // to test for overflows, add 'key' enough times to overflow a 4bit bucket,
     // while asserting that it stays a member
-    for(int i = 0; i < 257; i++){
+    for(int i = 0; i < 16; i++){
       bf.add(key);
       assertTrue(bf.membershipTest(key));
     }
-
+    // test approximateCount
+    CountingBloomFilter bf3 = new CountingBloomFilter(4, 2, Hash.JENKINS_HASH);
+    // test the exact range
+    for (int i = 0; i < 8; i++) {
+      bf3.add(key);
+      bf3.add(k2);
+      assertEquals(i + 1, bf3.approximateCount(key));
+      assertEquals(i + 1, bf3.approximateCount(k2));
+    }
+    // test gently degraded counting in high-fill, high error rate filter
+    for (int i = 8; i < 15; i++) {
+      bf3.add(key);
+      assertTrue(bf3.approximateCount(key) >= (i + 1));
+      assertEquals(8, bf3.approximateCount(k2));
+      assertEquals(0, bf3.approximateCount(k3));
+    }
   }
 
   /** Test a DynamicBloomFilter