HBASE-19 CountingBloomFilter can overflow its storage

git-svn-id: https://svn.apache.org/repos/asf/hadoop/hbase/trunk@619620 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael Stack 2008-02-07 20:10:40 +00:00
parent bfce24ce25
commit 51707644ed
3 changed files with 90 additions and 20 deletions

View File

@ -9,6 +9,8 @@ HBase Change Log
OPTIMIZATIONS
BUG FIXES
HBASE-19 CountingBloomFilter can overflow its storage
(Stu Hood and Bryan Duxbury via Stack)
IMPROVEMENTS
HBASE-415 Rewrite leases to use DelayedBlockingQueue instead of polling

View File

@ -50,6 +50,7 @@ package org.onelab.filter;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Arrays;
/**
* Implements a <i>counting Bloom filter</i>, as defined by Fan et al. in a ToN
@ -61,15 +62,18 @@ import java.io.IOException;
*
* contract <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>.
*
* @version 1.0 - 5 Feb. 07
* @version 1.1 - 19 Jan. 08
*
* @see org.onelab.filter.Filter The general behavior of a filter
*
* @see <a href="http://portal.acm.org/citation.cfm?id=343571.343572">Summary cache: a scalable wide-area web cache sharing protocol</a>
*/
public final class CountingBloomFilter extends Filter {
/** Counter vector. */
private byte[] vector;
/** Storage for the counting buckets */
private long[] buckets;
/** We are using 4bit buckets, so each bucket can count to 15 */
private final static long BUCKET_MAX_VALUE = 15;
/** Default constructor - use with readFields */
public CountingBloomFilter() {}
@ -81,9 +85,15 @@ public final class CountingBloomFilter extends Filter {
*/
public CountingBloomFilter(int vectorSize, int nbHash){
super(vectorSize, nbHash);
vector = new byte[vectorSize];
buckets = new long[buckets2words(vectorSize)];
}//end constructor
/** returns the number of 64 bit words it would take to hold vectorSize buckets */
private static int buckets2words(int vectorSize) {
return ((vectorSize - 1) >>> 4) + 1;
}
/** {@inheritDoc} */
@Override
public void add(Key key) {
@ -95,7 +105,18 @@ public final class CountingBloomFilter extends Filter {
hash.clear();
for(int i = 0; i < nbHash; i++) {
vector[h[i]]++;
// find the bucket
int wordNum = h[i] >> 4; // div 16
int bucketShift = (h[i] & 0x0f) << 2; // (mod 16) * 4
long bucketMask = 15L << bucketShift;
long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;
// only increment if the count in the bucket is less than BUCKET_MAX_VALUE
if(bucketValue < BUCKET_MAX_VALUE) {
// increment by 1
buckets[wordNum] = (buckets[wordNum] & ~bucketMask) | ((bucketValue + 1) << bucketShift);
}
}
}//end add()
@ -117,8 +138,17 @@ public final class CountingBloomFilter extends Filter {
hash.clear();
for(int i = 0; i < nbHash; i++) {
if(vector[h[i]] >= 1) {
vector[h[i]]--;
// find the bucket
int wordNum = h[i] >> 4; // div 16
int bucketShift = (h[i] & 0x0f) << 2; // (mod 16) * 4
long bucketMask = 15L << bucketShift;
long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;
// only decrement if the count in the bucket is between 0 and BUCKET_MAX_VALUE
if(bucketValue >= 1 && bucketValue < BUCKET_MAX_VALUE) {
// decrement by 1
buckets[wordNum] = (buckets[wordNum] & ~bucketMask) | ((bucketValue - 1) << bucketShift);
}
}
}//end delete
@ -133,9 +163,10 @@ public final class CountingBloomFilter extends Filter {
throw new IllegalArgumentException("filters cannot be and-ed");
}
CountingBloomFilter cbf = (CountingBloomFilter)filter;
for(int i = 0; i < vectorSize; i++) {
this.vector[i] &= cbf.vector[i];
int sizeInWords = buckets2words(vectorSize);
for(int i = 0; i < sizeInWords; i++) {
this.buckets[i] &= cbf.buckets[i];
}
}//end and()
@ -150,7 +181,13 @@ public final class CountingBloomFilter extends Filter {
hash.clear();
for(int i = 0; i < nbHash; i++) {
if(vector[h[i]] == 0) {
// find the bucket
int wordNum = h[i] >> 4; // div 16
int bucketShift = (h[i] & 0x0f) << 2; // (mod 16) * 4
long bucketMask = 15L << bucketShift;
if((buckets[wordNum] & bucketMask) == 0) {
return false;
}
}
@ -177,8 +214,9 @@ public final class CountingBloomFilter extends Filter {
CountingBloomFilter cbf = (CountingBloomFilter)filter;
for(int i = 0; i < vectorSize; i++) {
this.vector[i] |= cbf.vector[i];
int sizeInWords = buckets2words(vectorSize);
for(int i = 0; i < sizeInWords; i++) {
this.buckets[i] |= cbf.buckets[i];
}
}//end or()
@ -199,7 +237,14 @@ public final class CountingBloomFilter extends Filter {
if(i > 0) {
res.append(" ");
}
res.append(vector[i]&0xff);
int wordNum = i >> 4; // div 16
int bucketShift = (i & 0x0f) << 2; // (mod 16) * 4
long bucketMask = 15L << bucketShift;
long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;
res.append(bucketValue);
}
return res.toString();
@ -209,7 +254,7 @@ public final class CountingBloomFilter extends Filter {
@Override
public Object clone(){
CountingBloomFilter cbf = new CountingBloomFilter(vectorSize, nbHash);
cbf.or(this);
cbf.buckets = this.buckets.clone();
return cbf;
}//end clone()
@ -219,8 +264,9 @@ public final class CountingBloomFilter extends Filter {
@Override
public void write(DataOutput out) throws IOException {
super.write(out);
for(int i = 0; i < vector.length; i++) {
out.writeByte(vector[i]);
int sizeInWords = buckets2words(vectorSize);
for(int i = 0; i < sizeInWords; i++) {
out.writeLong(buckets[i]);
}
}
@ -228,9 +274,10 @@ public final class CountingBloomFilter extends Filter {
@Override
public void readFields(DataInput in) throws IOException {
super.readFields(in);
vector = new byte[vectorSize];
for(int i = 0; i < vector.length; i++) {
vector[i] = in.readByte();
int sizeInWords = buckets2words(vectorSize);
buckets = new long[sizeInWords];
for(int i = 0; i < sizeInWords; i++) {
buckets[i] = in.readLong();
}
}
}//end class

View File

@ -94,6 +94,27 @@ public class TestFilter extends TestCase {
assertTrue(bf.membershipTest(new StringKey("graknyl")));
assertFalse(bf.membershipTest(new StringKey("xyzzy")));
assertFalse(bf.membershipTest(new StringKey("abcd")));
// delete 'key', and check that it is no longer a member
((CountingBloomFilter)bf).delete(key);
assertFalse(bf.membershipTest(key));
// OR 'key' back into the filter
Filter bf2 = new CountingBloomFilter(8, 2);
bf2.add(key);
bf.or(bf2);
assertTrue(bf.membershipTest(key));
assertTrue(bf.membershipTest(new StringKey("graknyl")));
assertFalse(bf.membershipTest(new StringKey("xyzzy")));
assertFalse(bf.membershipTest(new StringKey("abcd")));
// to test for overflows, add 'key' enough times to overflow an 8bit bucket,
// while asserting that it stays a member
for(int i = 0; i < 257; i++){
bf.add(key);
assertTrue(bf.membershipTest(key));
}
}
/** Test a DynamicBloomFilter