HADOOP-1757 Bloomfilters: single argument constructor, use enum for bloom filter types

git-svn-id: https://svn.apache.org/repos/asf/lucene/hadoop/trunk/src/contrib/hbase@570270 13f79535-47bb-0310-9956-ffa450edef68
Jim Kellerman 2007-08-27 23:35:15 +00:00
parent e9aafde1f1
commit f56ee6b375
5 changed files with 201 additions and 59 deletions

CHANGES.txt

@@ -25,6 +25,8 @@ Trunk (unreleased changes)
   IMPROVEMENTS
     HADOOP-1737 Make HColumnDescriptor data members publically settable
     HADOOP-1746 Clean up findbugs warnings
+    HADOOP-1757 Bloomfilters: single argument constructor, use enum for bloom
+                filter types
   Below are the list of changes before 2007-08-18

BloomFilterDescriptor.java

@@ -27,40 +27,88 @@ import org.apache.hadoop.io.WritableComparable;
 /**
  * Supplied as a parameter to HColumnDescriptor to specify what kind of
- * bloom filter to use for a column, and its configuration parameters
+ * bloom filter to use for a column, and its configuration parameters.
+ *
+ * There is no way to automatically determine the vector size and the number of
+ * hash functions to use. In particular, bloom filters are very sensitive to the
+ * number of elements inserted into them. For HBase, the number of entries
+ * depends on the size of the data stored in the column. Currently the default
+ * region size is 64MB, so the number of entries is approximately
+ * 64MB / (average value size for column).
+ *
+ * If m denotes the number of bits in the Bloom filter (vectorSize),
+ * n denotes the number of elements inserted into the Bloom filter and
+ * k represents the number of hash functions used (nbHash), then according to
+ * Broder and Mitzenmacher,
+ *
+ * ( http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/BloomFilterSurvey.pdf )
+ *
+ * the probability of false positives is minimized when k is approximately
+ * m/n ln(2).
+ *
  */
 public class BloomFilterDescriptor implements WritableComparable {
+  private static final double DEFAULT_NUMBER_OF_HASH_FUNCTIONS = 4.0;
+
   /*
    * Specify the kind of bloom filter that will be instantiated
    */
-  /**
-   * <i>Bloom filter</i>, as defined by Bloom in 1970.
-   */
-  public static final int BLOOMFILTER = 1;
-
-  /**
-   * <i>counting Bloom filter</i>, as defined by Fan et al. in a ToN 2000 paper.
-   */
-  public static final int COUNTING_BLOOMFILTER = 2;
-
-  /**
-   * <i>retouched Bloom filter</i>, as defined in the CoNEXT 2006 paper.
-   */
-  public static final int RETOUCHED_BLOOMFILTER = 3;
+  /** The type of bloom filter */
+  public static enum BloomFilterType {
+    /** <i>Bloom filter</i>, as defined by Bloom in 1970. */
+    BLOOMFILTER,
+    /**
+     * <i>Counting Bloom filter</i>, as defined by Fan et al. in a ToN 2000 paper.
+     */
+    COUNTING_BLOOMFILTER,
+    /**
+     * <i>Retouched Bloom filter</i>, as defined in the CoNEXT 2006 paper.
+     */
+    RETOUCHED_BLOOMFILTER
+  }

   /** Default constructor - used in conjunction with Writable */
   public BloomFilterDescriptor() {
     super();
   }

+  /**
+   * Creates a BloomFilterDescriptor for the specified type of filter, fixes
+   * the number of hash functions to 4 and computes a vector size using:
+   *
+   * vectorSize = ceil((4 * n) / ln(2))
+   *
+   * @param type
+   * @param numberOfEntries
+   */
+  public BloomFilterDescriptor(final BloomFilterType type,
+      final int numberOfEntries) {
+    switch(type) {
+    case BLOOMFILTER:
+    case COUNTING_BLOOMFILTER:
+    case RETOUCHED_BLOOMFILTER:
+      this.filterType = type;
+      break;
+
+    default:
+      throw new IllegalArgumentException("Invalid bloom filter type: " + type);
+    }
+    this.nbHash = (int) DEFAULT_NUMBER_OF_HASH_FUNCTIONS;
+    this.vectorSize = (int) Math.ceil(
+        (DEFAULT_NUMBER_OF_HASH_FUNCTIONS * (1.0 * numberOfEntries)) /
+        Math.log(2.0));
+  }
+
   /**
    * @param type The kind of bloom filter to use.
    * @param vectorSize The vector size of <i>this</i> filter.
    * @param nbHash The number of hash functions to consider.
    */
-  public BloomFilterDescriptor(int type, int vectorSize, int nbHash) {
+  public BloomFilterDescriptor(final BloomFilterType type, final int vectorSize,
+      final int nbHash) {
     switch(type) {
     case BLOOMFILTER:
     case COUNTING_BLOOMFILTER:
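
Note: to sanity-check the math in the new single-argument constructor, here is a
standalone sketch (not part of the patch; class name is illustrative) that
reproduces the computation for the 1000-entry case exercised by
testComputedParameters below:

    // Hypothetical helper, not in the patch: reproduces
    // vectorSize = ceil((k * n) / ln 2) with k fixed at 4 hash functions.
    public class BloomFilterMath {
      public static void main(String[] args) {
        final double k = 4.0;  // DEFAULT_NUMBER_OF_HASH_FUNCTIONS
        final int n = 1000;    // estimated number of entries
        // From the optimum k = (m/n) ln 2, solving for m gives m = k * n / ln 2.
        int vectorSize = (int) Math.ceil((k * n) / Math.log(2.0));
        System.out.println(vectorSize);  // prints 5771
      }
    }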
@@ -75,7 +123,7 @@ public class BloomFilterDescriptor implements WritableComparable {
     this.nbHash = nbHash;
   }

-  int filterType;
+  BloomFilterType filterType;
   int vectorSize;
   int nbHash;
@@ -113,7 +161,7 @@ public class BloomFilterDescriptor implements WritableComparable {
   /** {@inheritDoc} */
   @Override
   public int hashCode() {
-    int result = Integer.valueOf(this.filterType).hashCode();
+    int result = this.filterType.hashCode();
     result ^= Integer.valueOf(this.vectorSize).hashCode();
     result ^= Integer.valueOf(this.nbHash).hashCode();
     return result;
@@ -123,14 +171,15 @@ public class BloomFilterDescriptor implements WritableComparable {
   /** {@inheritDoc} */
   public void readFields(DataInput in) throws IOException {
-    filterType = in.readInt();
+    int ordinal = in.readInt();
+    this.filterType = BloomFilterType.values()[ordinal];
     vectorSize = in.readInt();
     nbHash = in.readInt();
   }

   /** {@inheritDoc} */
   public void write(DataOutput out) throws IOException {
-    out.writeInt(filterType);
+    out.writeInt(filterType.ordinal());
     out.writeInt(vectorSize);
     out.writeInt(nbHash);
   }
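
Note: the type is now serialized as its enum ordinal, so the Writable encoding
remains three ints. A minimal standalone sketch of that round trip (local copy of
the enum for illustration; not part of the patch):

    import java.io.*;

    public class OrdinalRoundTrip {
      // Local stand-in for BloomFilterDescriptor.BloomFilterType
      enum BloomFilterType { BLOOMFILTER, COUNTING_BLOOMFILTER, RETOUCHED_BLOOMFILTER }

      public static void main(String[] args) throws IOException {
        BloomFilterType original = BloomFilterType.COUNTING_BLOOMFILTER;

        // write(): the enum is stored as its int ordinal
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        new DataOutputStream(bytes).writeInt(original.ordinal());

        // readFields(): the ordinal is mapped back through values()
        DataInputStream in =
          new DataInputStream(new ByteArrayInputStream(bytes.toByteArray()));
        BloomFilterType restored = BloomFilterType.values()[in.readInt()];

        System.out.println(restored);  // COUNTING_BLOOMFILTER
      }
    }

One design consequence worth noting: the old constants were 1, 2, 3 while enum
ordinals start at 0, so descriptors persisted before this change would not
deserialize to the same type.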
@@ -140,7 +189,7 @@ public class BloomFilterDescriptor implements WritableComparable {
   /** {@inheritDoc} */
   public int compareTo(Object o) {
     BloomFilterDescriptor other = (BloomFilterDescriptor)o;
-    int result = this.filterType - other.filterType;
+    int result = this.filterType.ordinal() - other.filterType.ordinal();
     if(result == 0) {
       result = this.vectorSize - other.vectorSize;
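
Note: a usage sketch of the two public constructors after this change (fragment,
assumes the patched BloomFilterDescriptor is on the classpath; values taken from
the updated tests):

    // Explicit parameters: caller chooses vector size and hash count.
    BloomFilterDescriptor explicit = new BloomFilterDescriptor(
        BloomFilterDescriptor.BloomFilterType.BLOOMFILTER,
        12499,  // vector size in bits
        4);     // number of hash functions

    // Computed parameters: nbHash is fixed at 4 and vectorSize is derived
    // from the estimated entry count: ceil((4 * 1000) / ln 2) = 5771.
    BloomFilterDescriptor computed = new BloomFilterDescriptor(
        BloomFilterDescriptor.BloomFilterType.BLOOMFILTER,
        1000);  // estimated number of entries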

HColumnDescriptor.java

@@ -31,6 +31,11 @@ import org.apache.hadoop.io.WritableComparable;
 /**
  * An HColumnDescriptor contains information about a column family such as the
  * number of versions, compression settings, etc.
+ *
+ * It is used as input when creating a table or adding a column. Once set, the
+ * parameters that specify a column cannot be changed without deleting the
+ * column and recreating it. If there is data stored in the column, it will be
+ * deleted when the column is deleted.
  */
 public class HColumnDescriptor implements WritableComparable {
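
Note: for an end-to-end example of constructing an HColumnDescriptor with a bloom
filter and passing it to HBaseAdmin.createTable(), see the updated
TestBloomFilters.testExplicitParameters() below.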

HStore.java

@@ -316,18 +316,21 @@ class HStore implements HConstants {
     if (LOG.isDebugEnabled()) {
       LOG.debug("loading bloom filter for " + this.storeName);
     }
-    switch(family.getBloomFilter().filterType) {
-    case BloomFilterDescriptor.BLOOMFILTER:
+
+    BloomFilterDescriptor.BloomFilterType type =
+      family.getBloomFilter().filterType;
+
+    switch(type) {
+    case BLOOMFILTER:
       bloomFilter = new BloomFilter();
       break;
-    case BloomFilterDescriptor.COUNTING_BLOOMFILTER:
+    case COUNTING_BLOOMFILTER:
       bloomFilter = new CountingBloomFilter();
       break;
-    case BloomFilterDescriptor.RETOUCHED_BLOOMFILTER:
+    case RETOUCHED_BLOOMFILTER:
       bloomFilter = new RetouchedBloomFilter();
     }
     FSDataInputStream in = fs.open(filterFile);
@@ -339,20 +342,23 @@ class HStore implements HConstants {
       LOG.debug("creating bloom filter for " + this.storeName);
     }
-    switch(family.getBloomFilter().filterType) {
-    case BloomFilterDescriptor.BLOOMFILTER:
+
+    BloomFilterDescriptor.BloomFilterType type =
+      family.getBloomFilter().filterType;
+
+    switch(type) {
+    case BLOOMFILTER:
       bloomFilter = new BloomFilter(family.getBloomFilter().vectorSize,
         family.getBloomFilter().nbHash);
       break;
-    case BloomFilterDescriptor.COUNTING_BLOOMFILTER:
+    case COUNTING_BLOOMFILTER:
       bloomFilter =
         new CountingBloomFilter(family.getBloomFilter().vectorSize,
           family.getBloomFilter().nbHash);
       break;
-    case BloomFilterDescriptor.RETOUCHED_BLOOMFILTER:
+    case RETOUCHED_BLOOMFILTER:
       bloomFilter =
         new RetouchedBloomFilter(family.getBloomFilter().vectorSize,
           family.getBloomFilter().nbHash);

TestBloomFilters.java

@@ -19,18 +19,16 @@
  */
 package org.apache.hadoop.hbase;

-import org.apache.log4j.Level;
-import org.apache.log4j.Logger;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.io.Text;

 /** Tests per-column bloom filters */
 public class TestBloomFilters extends HBaseClusterTestCase {
+  static final Log LOG = LogFactory.getLog(TestBloomFilters.class);
+
   private static final Text CONTENTS = new Text("contents:");

-  private HTableDescriptor desc = null;
-  private HTable table = null;
-
   private static final Text[] rows = {
     new Text("wmjwjzyv"),
     new Text("baietibz"),
@@ -144,28 +142,40 @@ public class TestBloomFilters extends HBaseClusterTestCase {
   /** constructor */
   public TestBloomFilters() {
     super();
-    conf.set("hbase.hregion.maxunflushed", "90");         // flush cache every 100 writes
+    conf.set("hbase.hregion.memcache.flush.size", "100"); // flush cache every 100 bytes
     conf.set("hbase.regionserver.maxlogentries", "90");   // and roll log too
-    Logger.getLogger(HRegion.class).setLevel(Level.DEBUG);
-    Logger.getLogger(HStore.class).setLevel(Level.DEBUG);
   }

-  /** {@inheritDoc} */
-  @Override
-  public void setUp() {
+  /** Test that specifies explicit parameters for the bloom filter */
+  public void testExplicitParameters() {
+    HTable table = null;
     try {
-      super.setUp();
-      this.desc = new HTableDescriptor("test");
+      // Setup
+      HTableDescriptor desc = new HTableDescriptor(getName());
+      BloomFilterDescriptor bloomFilter =
+        new BloomFilterDescriptor(                           // if we insert 1000 values
+            BloomFilterDescriptor.BloomFilterType.BLOOMFILTER, // plain old bloom filter
+            12499,                                           // number of bits
+            4                                                // number of hash functions
+        );
+
       desc.addFamily(
-          new HColumnDescriptor(CONTENTS, 1, HColumnDescriptor.CompressionType.NONE,
-              false, Integer.MAX_VALUE,
-              new BloomFilterDescriptor(        // if we insert 1000 values
-                  BloomFilterDescriptor.BLOOMFILTER, // plain old bloom filter
-                  12499,                        // number of bits
-                  4                             // number of hash functions
-              )));                              // false positive = 0.0000001
+          new HColumnDescriptor(CONTENTS,                  // Column name
+              1,                                           // Max versions
+              HColumnDescriptor.CompressionType.NONE,      // no compression
+              HColumnDescriptor.DEFAULT_IN_MEMORY,         // not in memory
+              HColumnDescriptor.DEFAULT_MAX_VALUE_LENGTH,
+              bloomFilter
+          )
+      );
+
+      // Create the table
       HBaseAdmin admin = new HBaseAdmin(conf);
       admin.createTable(desc);
+
+      // Open table
       table = new HTable(conf, desc.getName());

       // Store some values
@@ -181,10 +191,78 @@ public class TestBloomFilters extends HBaseClusterTestCase {
       e.printStackTrace();
       fail();
     }
-  }

-  /** the test */
-  public void testBloomFilters() {
+    try {
+      // Give cache flusher and log roller a chance to run
+      // Otherwise we'll never hit the bloom filter, just the memcache
+      Thread.sleep(conf.getLong(HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000) * 2);
+    } catch (InterruptedException e) {
+      // ignore
+    }
+
+    try {
+      if (table != null) {
+        for(int i = 0; i < testKeys.length; i++) {
+          byte[] value = table.get(testKeys[i], CONTENTS);
+          if(value != null && value.length != 0) {
+            LOG.info("non existant key: " + testKeys[i] + " returned value: " +
+              new String(value, HConstants.UTF8_ENCODING));
+          }
+        }
+      }
+    } catch (Exception e) {
+      e.printStackTrace();
+      fail();
+    }
+  }
+
+  /** Test that uses computed parameters for the bloom filter */
+  public void testComputedParameters() {
+    HTable table = null;
+    try {
+      // Setup
+      HTableDescriptor desc = new HTableDescriptor(getName());
+      BloomFilterDescriptor bloomFilter =
+        new BloomFilterDescriptor(
+            BloomFilterDescriptor.BloomFilterType.BLOOMFILTER, // plain old bloom filter
+            1000                                             // estimated number of entries
+        );
+      LOG.info("vector size: " + bloomFilter.vectorSize);
+
+      desc.addFamily(
+          new HColumnDescriptor(CONTENTS,                  // Column name
+              1,                                           // Max versions
+              HColumnDescriptor.CompressionType.NONE,      // no compression
+              HColumnDescriptor.DEFAULT_IN_MEMORY,         // not in memory
+              HColumnDescriptor.DEFAULT_MAX_VALUE_LENGTH,
+              bloomFilter
+          )
+      );
+
+      // Create the table
+      HBaseAdmin admin = new HBaseAdmin(conf);
+      admin.createTable(desc);
+
+      // Open table
+      table = new HTable(conf, desc.getName());
+
+      // Store some values
+      for(int i = 0; i < 100; i++) {
+        Text row = rows[i];
+        String value = row.toString();
+        long lockid = table.startUpdate(rows[i]);
+        table.put(lockid, CONTENTS, value.getBytes(HConstants.UTF8_ENCODING));
+        table.commit(lockid);
+      }
+    } catch (Exception e) {
+      e.printStackTrace();
+      fail();
+    }
+
     try {
       // Give cache flusher and log roller a chance to run
       // Otherwise we'll never hit the bloom filter, just the memcache
@@ -195,11 +273,13 @@ public class TestBloomFilters extends HBaseClusterTestCase {
     }

     try {
-      for(int i = 0; i < testKeys.length; i++) {
-        byte[] value = table.get(testKeys[i], CONTENTS);
-        if(value != null && value.length != 0) {
-          System.err.println("non existant key: " + testKeys[i] +
-            " returned value: " + new String(value, HConstants.UTF8_ENCODING));
+      if (table != null) {
+        for(int i = 0; i < testKeys.length; i++) {
+          byte[] value = table.get(testKeys[i], CONTENTS);
+          if(value != null && value.length != 0) {
+            LOG.info("non existant key: " + testKeys[i] + " returned value: " +
+              new String(value, HConstants.UTF8_ENCODING));
+          }
         }
       }
     } catch (Exception e) {