HBASE-908 Add approximate counting to CountingBloomFilter

git-svn-id: https://svn.apache.org/repos/asf/hadoop/hbase/trunk@700631 13f79535-47bb-0310-9956-ffa450edef68
2008-09-30 23:34:05 +00:00 · 2008-09-30 23:34:05 +00:00 · b45191d2d9
parent 5ae403ea35
commit b45191d2d9
3 changed files with 58 additions and 3 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -23,6 +23,8 @@ Release 0.19.0 - Unreleased
               should emit a sorted list of tables (Krzysztof Szlapinski via Stack)
   HBASE-884   Double and float converters for Bytes class
               (Doğacan Güney via Stack)
+   HBASE-908   Add approximate counting to CountingBloomFilter
+               (Andrzej Bialecki via Stack)

  NEW FEATURES
   HBASE-875   Use MurmurHash instead of JenkinsHash [in bloomfilters]
--- a/src/java/org/onelab/filter/CountingBloomFilter.java
+++ b/src/java/org/onelab/filter/CountingBloomFilter.java
@ -195,6 +195,44 @@ public final class CountingBloomFilter extends Filter {
    return true;
  }//end membershipTest()

+  /**
+   * Calculates an approximate count of the key, i.e. how many times the key
+   * was added to the filter, as the smallest of the buckets the key hashes
+   * to, so the filter can act as an approximate <code>key -&gt; count</code> map.
+   * <p>NOTE: due to the bucket size of this filter, inserting the same
+   * key more than 15 times will cause an overflow at all filter positions
+   * associated with this key, and it will significantly increase the error
+   * rate for this and other keys. For this reason the filter can only be
+   * used to store small count values <code>0 &lt;= N &lt;&lt; 15</code>.
+   * @param key key to be tested
+   * @return 0 if the key is not present. Otherwise, a positive value v will
+   * be returned such that <code>v == count</code> in the common case, and
+   * <code>v &gt; count</code> only when every bucket of the key collides with
+   * other keys, i.e. with probability of about the error rate of this filter.
+   * Additionally, if the filter experienced an underflow as a result of
+   * {@link #delete(Key)} operation, the return value may be lower than the
+   * <code>count</code>, with the probability of the false negative rate.
+   */
+  public int approximateCount(Key key) {
+    int res = Integer.MAX_VALUE;  // running minimum over the key's buckets
+    int[] h = hash.hash(key);
+    hash.clear();
+    for (int i = 0; i < nbHash; i++) {
+      // find the bucket
+      int wordNum = h[i] >> 4;          // div 16
+      int bucketShift = (h[i] & 0x0f) << 2;  // (mod 16) * 4
+      // isolate the 4-bit counter at that offset; its value is always 0..15
+      long bucketMask = 15L << bucketShift;
+      long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;
+      if (bucketValue < res) res = (int)bucketValue;  // keep the smallest counter
+    }
+    if (res != Integer.MAX_VALUE) {  // true whenever the loop examined a bucket
+      return res;
+    } else {
+      return 0;  // no bucket examined (loop body never ran)
+    }
+  }
+
  @Override
  public void not(){
    throw new UnsupportedOperationException("not() is undefined for "
--- a/src/test/org/onelab/test/TestFilter.java
+++ b/src/test/org/onelab/test/TestFilter.java
@ -291,13 +291,28 @@ public class TestFilter extends TestCase {
    assertFalse(bf.membershipTest(new StringKey("xyzzy")));
    assertFalse(bf.membershipTest(new StringKey("abcd")));
    
-    // to test for overflows, add 'key' enough times to overflow an 8bit bucket,
+    // to test for overflows, add 'key' enough times to overflow a 4bit bucket,
    // while asserting that it stays a member
-    for(int i = 0; i < 257; i++){
+    for(int i = 0; i < 16; i++){
      bf.add(key);
      assertTrue(bf.membershipTest(key));
    }
-    
+    // test approximateCount
+    CountingBloomFilter bf3 = new CountingBloomFilter(4, 2, Hash.JENKINS_HASH);
+    // test the exact range
+    for (int i = 0; i < 8; i++) {
+      bf3.add(key);
+      bf3.add(k2);
+      assertEquals(bf3.approximateCount(key), i + 1);
+      assertEquals(bf3.approximateCount(k2), i + 1);
+    }
+    // test gently degraded counting in high-fill, high error rate filter
+    for (int i = 8; i < 15; i++) {
+      bf3.add(key);
+      assertTrue(bf3.approximateCount(key) >= (i + 1));
+      assertEquals(bf3.approximateCount(k2), 8);
+      assertEquals(bf3.approximateCount(k3), 0);
+    }
  }
  
  /** Test a DynamicBloomFilter