From f9fda11ec057f27b8406384b9ef617eca6f21048 Mon Sep 17 00:00:00 2001 From: Michael Stack Date: Wed, 27 Oct 2010 04:15:26 +0000 Subject: [PATCH] HBASE-3158 Bloom File Writes Broken if keySize is large git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1027810 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 2 + .../hadoop/hbase/regionserver/StoreFile.java | 40 +++++++-- .../hadoop/hbase/util/ByteBloomFilter.java | 34 ++++---- .../hbase/regionserver/TestStoreFile.java | 81 +++++++++++++++---- 4 files changed, 117 insertions(+), 40 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index dd1c1234ea1..3fac751a712 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -615,6 +615,8 @@ Release 0.21.0 - Unreleased HBASE-2753 Remove sorted() methods from Result now that Gets are Scans HBASE-3147 Regions stuck in transition after rolling restart, perpetual timeout handling but nothing happens + HBASE-3158 Bloom File Writes Broken if keySize is large + (Nicolas Spiegelberg via Stack) IMPROVEMENTS HBASE-1760 Cleanup TODOs in HTable diff --git a/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFile.java b/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFile.java index 1af1387a685..bc92edc7f21 100644 --- a/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFile.java +++ b/src/main/java/org/apache/hadoop/hbase/regionserver/StoreFile.java @@ -83,6 +83,7 @@ public class StoreFile { // Config keys. static final String IO_STOREFILE_BLOOM_ERROR_RATE = "io.storefile.bloom.error.rate"; static final String IO_STOREFILE_BLOOM_MAX_FOLD = "io.storefile.bloom.max.fold"; + static final String IO_STOREFILE_BLOOM_MAX_KEYS = "io.storefile.bloom.max.keys"; static final String IO_STOREFILE_BLOOM_ENABLED = "io.storefile.bloom.enabled"; static final String HFILE_BLOCK_CACHE_SIZE_KEY = "hfile.block.cache.size"; @@ -691,6 +692,9 @@ public class StoreFile { this.kvComparator = comparator; + BloomFilter bloom = null; + BloomType bt = BloomType.NONE; + if (bloomType != BloomType.NONE && conf != null) { float err = conf.getFloat(IO_STOREFILE_BLOOM_ERROR_RATE, (float)0.01); // Since in row+col blooms we have 2 calls to shouldSeek() instead of 1 @@ -701,15 +705,31 @@ public class StoreFile { err /= 2; } int maxFold = conf.getInt(IO_STOREFILE_BLOOM_MAX_FOLD, 7); - - this.bloomFilter = new ByteBloomFilter(maxKeys, err, - Hash.getHashType(conf), maxFold); - this.bloomFilter.allocBloom(); - this.bloomType = bloomType; - } else { - this.bloomFilter = null; - this.bloomType = BloomType.NONE; + int tooBig = conf.getInt(IO_STOREFILE_BLOOM_MAX_KEYS, 128*1000*1000); + + if (maxKeys < tooBig) { + try { + bloom = new ByteBloomFilter(maxKeys, err, + Hash.getHashType(conf), maxFold); + bloom.allocBloom(); + bt = bloomType; + } catch (IllegalArgumentException iae) { + LOG.warn(String.format( + "Parse error while creating bloom for %s (%d, %f)", + path, maxKeys, err), iae); + bloom = null; + bt = BloomType.NONE; + } + } else { + if (LOG.isDebugEnabled()) { + LOG.debug("Skipping bloom filter because max keysize too large: " + + maxKeys); + } + } } + + this.bloomFilter = bloom; + this.bloomType = bt; } /** @@ -820,6 +840,10 @@ public class StoreFile { public Path getPath() { return this.writer.getPath(); } + + boolean hasBloom() { + return this.bloomFilter != null; + } public void append(final byte [] key, final byte [] value) throws IOException { if (this.bloomFilter != null) { diff --git a/src/main/java/org/apache/hadoop/hbase/util/ByteBloomFilter.java b/src/main/java/org/apache/hadoop/hbase/util/ByteBloomFilter.java index 35190f08926..76828347159 100644 --- a/src/main/java/org/apache/hadoop/hbase/util/ByteBloomFilter.java +++ b/src/main/java/org/apache/hadoop/hbase/util/ByteBloomFilter.java @@ -52,7 +52,7 @@ public class ByteBloomFilter implements BloomFilter { public static final int VERSION = 1; /** Bytes (B) in the array */ - protected int byteSize; + protected long byteSize; /** Number of hash functions */ protected final int hashCount; /** Hash type */ @@ -134,11 +134,11 @@ public class ByteBloomFilter implements BloomFilter { * * The probability of false positives is minimized when k = m/n ln(2). */ - int bitSize = (int)Math.ceil(maxKeys * (Math.log(errorRate) / Math.log(0.6185))); + long bitSize = (long)Math.ceil(maxKeys * (Math.log(errorRate) / Math.log(0.6185))); int functionCount = (int)Math.ceil(Math.log(2) * (bitSize / maxKeys)); // increase byteSize so folding is possible - int byteSize = (bitSize + 7) / 8; + long byteSize = (bitSize + 7) / 8; int mask = (1 << foldFactor) - 1; if ( (mask & byteSize) != 0) { byteSize >>= foldFactor; @@ -161,13 +161,13 @@ public class ByteBloomFilter implements BloomFilter { if (this.bloom != null) { throw new IllegalArgumentException("can only create bloom once."); } - this.bloom = ByteBuffer.allocate(this.byteSize); + this.bloom = ByteBuffer.allocate((int)this.byteSize); assert this.bloom.hasArray(); } void sanityCheck() throws IllegalArgumentException { - if(this.byteSize <= 0) { - throw new IllegalArgumentException("maxValue must be > 0"); + if(0 >= this.byteSize || this.byteSize > Integer.MAX_VALUE) { + throw new IllegalArgumentException("Invalid byteSize: " + this.byteSize); } if(this.hashCount <= 0) { @@ -205,7 +205,7 @@ public class ByteBloomFilter implements BloomFilter { int hash2 = this.hash.hash(buf, offset, len, hash1); for (int i = 0; i < this.hashCount; i++) { - int hashLoc = Math.abs((hash1 + i * hash2) % (this.byteSize * 8)); + long hashLoc = Math.abs((hash1 + i * hash2) % (this.byteSize * 8)); set(hashLoc); } @@ -243,7 +243,7 @@ public class ByteBloomFilter implements BloomFilter { int hash2 = this.hash.hash(buf, offset, length, hash1); for (int i = 0; i < this.hashCount; i++) { - int hashLoc = Math.abs((hash1 + i * hash2) % (this.byteSize * 8)); + long hashLoc = Math.abs((hash1 + i * hash2) % (this.byteSize * 8)); if (!get(hashLoc, theBloom) ) { return false; } @@ -259,9 +259,9 @@ public class ByteBloomFilter implements BloomFilter { * * @param pos index of bit */ - void set(int pos) { - int bytePos = pos / 8; - int bitPos = pos % 8; + void set(long pos) { + int bytePos = (int)(pos / 8); + int bitPos = (int)(pos % 8); byte curByte = bloom.get(bytePos); curByte |= bitvals[bitPos]; bloom.put(bytePos, curByte); @@ -273,9 +273,9 @@ public class ByteBloomFilter implements BloomFilter { * @param pos index of bit * @return true if bit at specified index is 1, false if 0. */ - static boolean get(int pos, ByteBuffer theBloom) { - int bytePos = pos / 8; - int bitPos = pos % 8; + static boolean get(long pos, ByteBuffer theBloom) { + int bytePos = (int)(pos / 8); + int bitPos = (int)(pos % 8); byte curByte = theBloom.get(bytePos); curByte &= bitvals[bitPos]; return (curByte != 0); @@ -293,7 +293,7 @@ public class ByteBloomFilter implements BloomFilter { @Override public int getByteSize() { - return this.byteSize; + return (int)this.byteSize; } @Override @@ -301,7 +301,7 @@ public class ByteBloomFilter implements BloomFilter { // see if the actual size is exponentially smaller than expected. if (this.keyCount > 0 && this.bloom.hasArray()) { int pieces = 1; - int newByteSize = this.byteSize; + int newByteSize = (int)this.byteSize; int newMaxKeys = this.maxKeys; // while exponentially smaller & folding is lossless @@ -367,7 +367,7 @@ public class ByteBloomFilter implements BloomFilter { @Override public void write(DataOutput out) throws IOException { out.writeInt(VERSION); - out.writeInt(byteSize); + out.writeInt((int)byteSize); out.writeInt(hashCount); out.writeInt(hashType); out.writeInt(keyCount); diff --git a/src/test/java/org/apache/hadoop/hbase/regionserver/TestStoreFile.java b/src/test/java/org/apache/hadoop/hbase/regionserver/TestStoreFile.java index 38ef52001ee..0a6872b3eae 100644 --- a/src/test/java/org/apache/hadoop/hbase/regionserver/TestStoreFile.java +++ b/src/test/java/org/apache/hadoop/hbase/regionserver/TestStoreFile.java @@ -40,7 +40,9 @@ import org.apache.hadoop.hbase.client.Scan; import org.apache.hadoop.hbase.io.Reference.Range; import org.apache.hadoop.hbase.io.hfile.HFile; import org.apache.hadoop.hbase.io.hfile.HFileScanner; +import org.apache.hadoop.hbase.util.ByteBloomFilter; import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.Hash; import org.apache.hadoop.hdfs.MiniDFSCluster; import org.mockito.Mockito; @@ -322,18 +324,11 @@ public class TestStoreFile extends HBaseTestCase { private static String ROOT_DIR = HBaseTestingUtility.getTestDir("TestStoreFile").toString(); private static String localFormatter = "%010d"; - - public void testBloomFilter() throws Exception { - FileSystem fs = FileSystem.getLocal(conf); - conf.setFloat("io.hfile.bloom.error.rate", (float)0.01); - conf.setBoolean("io.hfile.bloom.enabled", true); - - // write the file - Path f = new Path(ROOT_DIR, getName()); - StoreFile.Writer writer = new StoreFile.Writer(fs, f, - StoreFile.DEFAULT_BLOCKSIZE_SMALL, HFile.DEFAULT_COMPRESSION_ALGORITHM, - conf, KeyValue.COMPARATOR, StoreFile.BloomType.ROW, 2000); - + + private void bloomWriteRead(StoreFile.Writer writer, FileSystem fs) + throws Exception { + float err = conf.getFloat(StoreFile.IO_STOREFILE_BLOOM_ERROR_RATE, 0); + Path f = writer.getPath(); long now = System.currentTimeMillis(); for (int i = 0; i < 2000; i += 2) { String row = String.format(localFormatter, i); @@ -370,14 +365,31 @@ public class TestStoreFile extends HBaseTestCase { System.out.println("False negatives: " + falseNeg); assertEquals(0, falseNeg); System.out.println("False positives: " + falsePos); - assertTrue(falsePos < 2); + if (!(falsePos <= 2* 2000 * err)) { + System.out.println("WTFBBQ! " + falsePos + ", " + (2* 2000 * err) ); + } + assertTrue(falsePos <= 2* 2000 * err); + } + + public void testBloomFilter() throws Exception { + FileSystem fs = FileSystem.getLocal(conf); + conf.setFloat(StoreFile.IO_STOREFILE_BLOOM_ERROR_RATE, (float)0.01); + conf.setBoolean(StoreFile.IO_STOREFILE_BLOOM_ENABLED, true); + + // write the file + Path f = new Path(ROOT_DIR, getName()); + StoreFile.Writer writer = new StoreFile.Writer(fs, f, + StoreFile.DEFAULT_BLOCKSIZE_SMALL, HFile.DEFAULT_COMPRESSION_ALGORITHM, + conf, KeyValue.COMPARATOR, StoreFile.BloomType.ROW, 2000); + + bloomWriteRead(writer, fs); } public void testBloomTypes() throws Exception { float err = (float) 0.01; FileSystem fs = FileSystem.getLocal(conf); - conf.setFloat("io.hfile.bloom.error.rate", err); - conf.setBoolean("io.hfile.bloom.enabled", true); + conf.setFloat(StoreFile.IO_STOREFILE_BLOOM_ERROR_RATE, err); + conf.setBoolean(StoreFile.IO_STOREFILE_BLOOM_ENABLED, true); int rowCount = 50; int colCount = 10; @@ -455,6 +467,45 @@ public class TestStoreFile extends HBaseTestCase { } } + public void testBloomEdgeCases() throws Exception { + float err = (float)0.005; + FileSystem fs = FileSystem.getLocal(conf); + Path f = new Path(ROOT_DIR, getName()); + conf.setFloat(StoreFile.IO_STOREFILE_BLOOM_ERROR_RATE, err); + conf.setBoolean(StoreFile.IO_STOREFILE_BLOOM_ENABLED, true); + conf.setInt(StoreFile.IO_STOREFILE_BLOOM_MAX_KEYS, 1000); + + // this should not create a bloom because the max keys is too small + StoreFile.Writer writer = new StoreFile.Writer(fs, f, + StoreFile.DEFAULT_BLOCKSIZE_SMALL, HFile.DEFAULT_COMPRESSION_ALGORITHM, + conf, KeyValue.COMPARATOR, StoreFile.BloomType.ROW, 2000); + assertFalse(writer.hasBloom()); + writer.close(); + fs.delete(f, true); + + conf.setInt(StoreFile.IO_STOREFILE_BLOOM_MAX_KEYS, Integer.MAX_VALUE); + + // TODO: commented out because we run out of java heap space on trunk + /* + // the below config caused IllegalArgumentException in our production cluster + // however, the resulting byteSize is < MAX_INT, so this should work properly + writer = new StoreFile.Writer(fs, f, + StoreFile.DEFAULT_BLOCKSIZE_SMALL, HFile.DEFAULT_COMPRESSION_ALGORITHM, + conf, KeyValue.COMPARATOR, StoreFile.BloomType.ROW, 272446963); + assertTrue(writer.hasBloom()); + bloomWriteRead(writer, fs); + */ + + // this, however, is too large and should not create a bloom + // because Java can't create a contiguous array > MAX_INT + writer = new StoreFile.Writer(fs, f, + StoreFile.DEFAULT_BLOCKSIZE_SMALL, HFile.DEFAULT_COMPRESSION_ALGORITHM, + conf, KeyValue.COMPARATOR, StoreFile.BloomType.ROW, Integer.MAX_VALUE); + assertFalse(writer.hasBloom()); + writer.close(); + fs.delete(f, true); + } + public void testFlushTimeComparator() { assertOrdering(StoreFile.Comparators.FLUSH_TIME, mockStoreFile(true, 1000, -1, "/foo/123"),