diff --git a/CHANGES.txt b/CHANGES.txt
index f66635fedcf..e5170ab7c9a 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -23,6 +23,8 @@ Release 0.19.0 - Unreleased
should emit a sorted list of tables (Krzysztof Szlapinski via Stack)
HBASE-884 Double and float converters for Bytes class
(Doğacan Güney via Stack)
+ HBASE-908 Add approximate counting to CountingBloomFilter
+ (Andrzej Bialecki via Stack)
NEW FEATURES
HBASE-875 Use MurmurHash instead of JenkinsHash [in bloomfilters]
diff --git a/src/java/org/onelab/filter/CountingBloomFilter.java b/src/java/org/onelab/filter/CountingBloomFilter.java
index 9793d172004..00c346431ff 100644
--- a/src/java/org/onelab/filter/CountingBloomFilter.java
+++ b/src/java/org/onelab/filter/CountingBloomFilter.java
@@ -195,6 +195,44 @@ public final class CountingBloomFilter extends Filter {
return true;
}//end membershipTest()
+ /**
+  * Estimates how many times the given key was added to this filter, which
+  * allows the filter to be used as an approximate {@code key -> count} map.
+  * <p>
+  * The estimate is the smallest value found among the 4-bit buckets at the
+  * positions the key hashes to.
+  * <p>
+  * NOTE: due to the bucket size of this filter (4 bits, so at most 15 per
+  * bucket), inserting the same key more than 15 times will cause an
+  * overflow at all filter positions associated with this key, and it will
+  * significantly increase the error rate for this and other keys. For this
+  * reason the filter can only be used to store small count values N, with
+  * 0 &lt;= N and N much smaller than 15.
+  *
+  * @param key key to be tested
+  * @return 0 if the key is not present. Otherwise, a positive value v is
+  * returned such that {@code v == count} unless every bucket of this key
+  * was also incremented by other keys; in that case, which is expected to
+  * occur with a probability comparable to the error rate of this filter,
+  * {@code v > count}.
+  * Additionally, if the filter experienced an underflow as a result of a
+  * {@link #delete(Key)} operation, the return value may be lower than
+  * {@code count}, with the probability of the false negative rate of such
+  * a filter.
+  */
+ public int approximateCount(Key key) {
+   int[] positions = hash.hash(key);
+   hash.clear();
+   int smallest = Integer.MAX_VALUE;
+   for (int i = 0; i < nbHash; i++) {
+     int position = positions[i];
+     // sixteen 4-bit buckets are packed into each long word
+     int word = position >> 4;             // position / 16
+     int shift = (position & 0x0f) << 2;   // (position mod 16) * 4
+     // move the bucket down to the low nibble and isolate it
+     long counter = (buckets[word] >>> shift) & 15L;
+     if (counter < smallest) {
+       smallest = (int) counter;
+     }
+   }
+   // MAX_VALUE survives only when no bucket was examined at all
+   return smallest == Integer.MAX_VALUE ? 0 : smallest;
+ }
+
@Override
public void not(){
throw new UnsupportedOperationException("not() is undefined for "
diff --git a/src/test/org/onelab/test/TestFilter.java b/src/test/org/onelab/test/TestFilter.java
index e39616f0ad0..6c88c1ab33f 100644
--- a/src/test/org/onelab/test/TestFilter.java
+++ b/src/test/org/onelab/test/TestFilter.java
@@ -291,13 +291,28 @@ public class TestFilter extends TestCase {
assertFalse(bf.membershipTest(new StringKey("xyzzy")));
assertFalse(bf.membershipTest(new StringKey("abcd")));
- // to test for overflows, add 'key' enough times to overflow an 8bit bucket,
+ // to test for overflows, add 'key' enough times to overflow a 4bit bucket,
// while asserting that it stays a member
- for(int i = 0; i < 257; i++){
+ for(int i = 0; i < 16; i++){
bf.add(key);
assertTrue(bf.membershipTest(key));
}
-
+ // test approximateCount
+ CountingBloomFilter bf3 = new CountingBloomFilter(4, 2, Hash.JENKINS_HASH);
+ // test the exact range
+ for (int i = 0; i < 8; i++) {
+ bf3.add(key);
+ bf3.add(k2);
+ assertEquals(bf3.approximateCount(key), i + 1);
+ assertEquals(bf3.approximateCount(k2), i + 1);
+ }
+ // test gently degraded counting in high-fill, high error rate filter
+ for (int i = 8; i < 15; i++) {
+ bf3.add(key);
+ assertTrue(bf3.approximateCount(key) >= (i + 1));
+ assertEquals(bf3.approximateCount(k2), 8);
+ assertEquals(bf3.approximateCount(k3), 0);
+ }
}
/** Test a DynamicBloomFilter