diff --git a/CHANGES.txt b/CHANGES.txt index 09b14c7e621..c5f984dd36c 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -25,6 +25,8 @@ Trunk (unreleased changes) IMPROVEMENTS HADOOP-1737 Make HColumnDescriptor data publically members settable HADOOP-1746 Clean up findbugs warnings + HADOOP-1757 Bloomfilters: single argument constructor, use enum for bloom + filter types Below are the list of changes before 2007-08-18 diff --git a/src/java/org/apache/hadoop/hbase/BloomFilterDescriptor.java b/src/java/org/apache/hadoop/hbase/BloomFilterDescriptor.java index 0d9df68f67b..5d7611ab49f 100644 --- a/src/java/org/apache/hadoop/hbase/BloomFilterDescriptor.java +++ b/src/java/org/apache/hadoop/hbase/BloomFilterDescriptor.java @@ -27,40 +27,88 @@ import org.apache.hadoop.io.WritableComparable; /** * Supplied as a parameter to HColumnDescriptor to specify what kind of - * bloom filter to use for a column, and its configuration parameters + * bloom filter to use for a column, and its configuration parameters. + * + * There is no way to automatically determine the vector size and the number of + * hash functions to use. In particular, bloom filters are very sensitive to the + * number of elements inserted into them. For HBase, the number of entries + * depends on the size of the data stored in the column. Currently the default + * region size is 64MB, so the number of entries is approximately + * 64MB / (average value size for column). + * + * If m denotes the number of bits in the Bloom filter (vectorSize), + * n denotes the number of elements inserted into the Bloom filter and + * k represents the number of hash functions used (nbHash), then according to + * Broder and Mitzenmacher, + * + * ( http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/BloomFilterSurvey.pdf ) + * + * the probability of false positives is minimized when k is approximately + * m/n ln(2). + * */ public class BloomFilterDescriptor implements WritableComparable { + private static final double DEFAULT_NUMBER_OF_HASH_FUNCTIONS = 4.0; /* * Specify the kind of bloom filter that will be instantiated */ - /** - * Bloom filter, as defined by Bloom in 1970. - */ - public static final int BLOOMFILTER = 1; - - /** - * counting Bloom filter, as defined by Fan et al. in a ToN 2000 paper. - */ - public static final int COUNTING_BLOOMFILTER = 2; - - /** - * retouched Bloom filter, as defined in the CoNEXT 2006 paper. - */ - public static final int RETOUCHED_BLOOMFILTER = 3; + /** The type of bloom filter */ + public static enum BloomFilterType { + /** Bloom filter, as defined by Bloom in 1970. */ + BLOOMFILTER, + /** + * Counting Bloom filter, as defined by Fan et al. in a ToN 2000 paper. + */ + COUNTING_BLOOMFILTER, + /** + * Retouched Bloom filter, as defined in the CoNEXT 2006 paper. + */ + RETOUCHED_BLOOMFILTER + } /** Default constructor - used in conjunction with Writable */ public BloomFilterDescriptor() { super(); } + /** + * Creates a BloomFilterDescriptor for the specified type of filter, fixes + * the number of hash functions to 4 and computes a vector size using: + * + * vectorSize = ceil((4 * n) / ln(2)) + * + * @param type + * @param numberOfEntries + */ + public BloomFilterDescriptor(final BloomFilterType type, + final int numberOfEntries) { + + switch(type) { + case BLOOMFILTER: + case COUNTING_BLOOMFILTER: + case RETOUCHED_BLOOMFILTER: + this.filterType = type; + break; + + default: + throw new IllegalArgumentException("Invalid bloom filter type: " + type); + } + this.nbHash = (int) DEFAULT_NUMBER_OF_HASH_FUNCTIONS; + this.vectorSize = (int) Math.ceil( + (DEFAULT_NUMBER_OF_HASH_FUNCTIONS * (1.0 * numberOfEntries)) / + Math.log(2.0)); + } + /** * @param type The kind of bloom filter to use. * @param vectorSize The vector size of this filter. * @param nbHash The number of hash functions to consider. */ - public BloomFilterDescriptor(int type, int vectorSize, int nbHash) { + public BloomFilterDescriptor(final BloomFilterType type, final int vectorSize, + final int nbHash) { + switch(type) { case BLOOMFILTER: case COUNTING_BLOOMFILTER: @@ -75,7 +123,7 @@ public class BloomFilterDescriptor implements WritableComparable { this.nbHash = nbHash; } - int filterType; + BloomFilterType filterType; int vectorSize; int nbHash; @@ -113,7 +161,7 @@ public class BloomFilterDescriptor implements WritableComparable { /** {@inheritDoc} */ @Override public int hashCode() { - int result = Integer.valueOf(this.filterType).hashCode(); + int result = this.filterType.hashCode(); result ^= Integer.valueOf(this.vectorSize).hashCode(); result ^= Integer.valueOf(this.nbHash).hashCode(); return result; @@ -123,14 +171,15 @@ public class BloomFilterDescriptor implements WritableComparable { /** {@inheritDoc} */ public void readFields(DataInput in) throws IOException { - filterType = in.readInt(); + int ordinal = in.readInt(); + this.filterType = BloomFilterType.values()[ordinal]; vectorSize = in.readInt(); nbHash = in.readInt(); } /** {@inheritDoc} */ public void write(DataOutput out) throws IOException { - out.writeInt(filterType); + out.writeInt(filterType.ordinal()); out.writeInt(vectorSize); out.writeInt(nbHash); } @@ -140,7 +189,7 @@ public class BloomFilterDescriptor implements WritableComparable { /** {@inheritDoc} */ public int compareTo(Object o) { BloomFilterDescriptor other = (BloomFilterDescriptor)o; - int result = this.filterType - other.filterType; + int result = this.filterType.ordinal() - other.filterType.ordinal(); if(result == 0) { result = this.vectorSize - other.vectorSize; diff --git a/src/java/org/apache/hadoop/hbase/HColumnDescriptor.java b/src/java/org/apache/hadoop/hbase/HColumnDescriptor.java index 453d9029bc7..3285f02f15f 100644 --- a/src/java/org/apache/hadoop/hbase/HColumnDescriptor.java +++ b/src/java/org/apache/hadoop/hbase/HColumnDescriptor.java @@ -31,6 +31,11 @@ import org.apache.hadoop.io.WritableComparable; /** * An HColumnDescriptor contains information about a column family such as the * number of versions, compression settings, etc. + * + * It is used as input when creating a table or adding a column. Once set, the + * parameters that specify a column cannot be changed without deleting the + * column and recreating it. If there is data stored in the column, it will be + * deleted when the column is deleted. */ public class HColumnDescriptor implements WritableComparable { diff --git a/src/java/org/apache/hadoop/hbase/HStore.java b/src/java/org/apache/hadoop/hbase/HStore.java index 67bc5dd284b..cbdb4481221 100644 --- a/src/java/org/apache/hadoop/hbase/HStore.java +++ b/src/java/org/apache/hadoop/hbase/HStore.java @@ -316,18 +316,21 @@ class HStore implements HConstants { if (LOG.isDebugEnabled()) { LOG.debug("loading bloom filter for " + this.storeName); } - - switch(family.getBloomFilter().filterType) { - case BloomFilterDescriptor.BLOOMFILTER: + BloomFilterDescriptor.BloomFilterType type = + family.getBloomFilter().filterType; + + switch(type) { + + case BLOOMFILTER: bloomFilter = new BloomFilter(); break; - case BloomFilterDescriptor.COUNTING_BLOOMFILTER: + case COUNTING_BLOOMFILTER: bloomFilter = new CountingBloomFilter(); break; - case BloomFilterDescriptor.RETOUCHED_BLOOMFILTER: + case RETOUCHED_BLOOMFILTER: bloomFilter = new RetouchedBloomFilter(); } FSDataInputStream in = fs.open(filterFile); @@ -339,20 +342,23 @@ class HStore implements HConstants { LOG.debug("creating bloom filter for " + this.storeName); } - switch(family.getBloomFilter().filterType) { + BloomFilterDescriptor.BloomFilterType type = + family.getBloomFilter().filterType; + + switch(type) { - case BloomFilterDescriptor.BLOOMFILTER: + case BLOOMFILTER: bloomFilter = new BloomFilter(family.getBloomFilter().vectorSize, family.getBloomFilter().nbHash); break; - case BloomFilterDescriptor.COUNTING_BLOOMFILTER: + case COUNTING_BLOOMFILTER: bloomFilter = new CountingBloomFilter(family.getBloomFilter().vectorSize, family.getBloomFilter().nbHash); break; - case BloomFilterDescriptor.RETOUCHED_BLOOMFILTER: + case RETOUCHED_BLOOMFILTER: bloomFilter = new RetouchedBloomFilter(family.getBloomFilter().vectorSize, family.getBloomFilter().nbHash); diff --git a/src/test/org/apache/hadoop/hbase/TestBloomFilters.java b/src/test/org/apache/hadoop/hbase/TestBloomFilters.java index 94ebdf3ff73..eced4f85c49 100644 --- a/src/test/org/apache/hadoop/hbase/TestBloomFilters.java +++ b/src/test/org/apache/hadoop/hbase/TestBloomFilters.java @@ -19,18 +19,16 @@ */ package org.apache.hadoop.hbase; -import org.apache.log4j.Level; -import org.apache.log4j.Logger; - +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.hadoop.io.Text; /** Tests per-column bloom filters */ public class TestBloomFilters extends HBaseClusterTestCase { + static final Log LOG = LogFactory.getLog(TestBloomFilters.class); + private static final Text CONTENTS = new Text("contents:"); - private HTableDescriptor desc = null; - private HTable table = null; - private static final Text[] rows = { new Text("wmjwjzyv"), new Text("baietibz"), @@ -144,28 +142,40 @@ public class TestBloomFilters extends HBaseClusterTestCase { /** constructor */ public TestBloomFilters() { super(); - conf.set("hbase.hregion.maxunflushed", "90"); // flush cache every 100 writes + conf.set("hbase.hregion.memcache.flush.size", "100");// flush cache every 100 bytes conf.set("hbase.regionserver.maxlogentries", "90"); // and roll log too - Logger.getLogger(HRegion.class).setLevel(Level.DEBUG); - Logger.getLogger(HStore.class).setLevel(Level.DEBUG); } - /** {@inheritDoc} */ - @Override - public void setUp() { + /** Test that specifies explicit parameters for the bloom filter */ + public void testExplicitParameters() { + HTable table = null; try { - super.setUp(); - this.desc = new HTableDescriptor("test"); + // Setup + HTableDescriptor desc = new HTableDescriptor(getName()); + BloomFilterDescriptor bloomFilter = + new BloomFilterDescriptor( // if we insert 1000 values + BloomFilterDescriptor.BloomFilterType.BLOOMFILTER, // plain old bloom filter + 12499, // number of bits + 4 // number of hash functions + ); + desc.addFamily( - new HColumnDescriptor(CONTENTS, 1, HColumnDescriptor.CompressionType.NONE, - false, Integer.MAX_VALUE, - new BloomFilterDescriptor( // if we insert 1000 values - BloomFilterDescriptor.BLOOMFILTER, // plain old bloom filter - 12499, // number of bits - 4 // number of hash functions - ))); // false positive = 0.0000001 + new HColumnDescriptor(CONTENTS, // Column name + 1, // Max versions + HColumnDescriptor.CompressionType.NONE, // no compression + HColumnDescriptor.DEFAULT_IN_MEMORY, // not in memory + HColumnDescriptor.DEFAULT_MAX_VALUE_LENGTH, + bloomFilter + ) + ); + + // Create the table + HBaseAdmin admin = new HBaseAdmin(conf); admin.createTable(desc); + + // Open table + table = new HTable(conf, desc.getName()); // Store some values @@ -181,10 +191,78 @@ public class TestBloomFilters extends HBaseClusterTestCase { e.printStackTrace(); fail(); } - } + try { + // Give cache flusher and log roller a chance to run + // Otherwise we'll never hit the bloom filter, just the memcache + Thread.sleep(conf.getLong(HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000) * 2); + + } catch (InterruptedException e) { + // ignore + } - /** the test */ - public void testBloomFilters() { + + try { + if (table != null) { + for(int i = 0; i < testKeys.length; i++) { + byte[] value = table.get(testKeys[i], CONTENTS); + if(value != null && value.length != 0) { + LOG.info("non existant key: " + testKeys[i] + " returned value: " + + new String(value, HConstants.UTF8_ENCODING)); + } + } + } + } catch (Exception e) { + e.printStackTrace(); + fail(); + } + } + + /** Test that uses computed for the bloom filter */ + public void testComputedParameters() { + HTable table = null; + try { + // Setup + HTableDescriptor desc = new HTableDescriptor(getName()); + + BloomFilterDescriptor bloomFilter = + new BloomFilterDescriptor( + BloomFilterDescriptor.BloomFilterType.BLOOMFILTER, // plain old bloom filter + 1000 // estimated number of entries + ); + LOG.info("vector size: " + bloomFilter.vectorSize); + + desc.addFamily( + new HColumnDescriptor(CONTENTS, // Column name + 1, // Max versions + HColumnDescriptor.CompressionType.NONE, // no compression + HColumnDescriptor.DEFAULT_IN_MEMORY, // not in memory + HColumnDescriptor.DEFAULT_MAX_VALUE_LENGTH, + bloomFilter + ) + ); + + // Create the table + + HBaseAdmin admin = new HBaseAdmin(conf); + admin.createTable(desc); + + // Open table + + table = new HTable(conf, desc.getName()); + + // Store some values + + for(int i = 0; i < 100; i++) { + Text row = rows[i]; + String value = row.toString(); + long lockid = table.startUpdate(rows[i]); + table.put(lockid, CONTENTS, value.getBytes(HConstants.UTF8_ENCODING)); + table.commit(lockid); + } + } catch (Exception e) { + e.printStackTrace(); + fail(); + } try { // Give cache flusher and log roller a chance to run // Otherwise we'll never hit the bloom filter, just the memcache @@ -195,11 +273,13 @@ public class TestBloomFilters extends HBaseClusterTestCase { } try { - for(int i = 0; i < testKeys.length; i++) { - byte[] value = table.get(testKeys[i], CONTENTS); - if(value != null && value.length != 0) { - System.err.println("non existant key: " + testKeys[i] + - " returned value: " + new String(value, HConstants.UTF8_ENCODING)); + if (table != null) { + for(int i = 0; i < testKeys.length; i++) { + byte[] value = table.get(testKeys[i], CONTENTS); + if(value != null && value.length != 0) { + LOG.info("non existant key: " + testKeys[i] + " returned value: " + + new String(value, HConstants.UTF8_ENCODING)); + } } } } catch (Exception e) {