HBASE-3158 Bloom File Writes Broken if keySize is large

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1027810 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael Stack 2010-10-27 04:15:26 +00:00
parent 97f7976755
commit f9fda11ec0
4 changed files with 117 additions and 40 deletions

View File

@ -615,6 +615,8 @@ Release 0.21.0 - Unreleased
HBASE-2753 Remove sorted() methods from Result now that Gets are Scans
HBASE-3147 Regions stuck in transition after rolling restart, perpetual
timeout handling but nothing happens
HBASE-3158 Bloom File Writes Broken if keySize is large
(Nicolas Spiegelberg via Stack)
IMPROVEMENTS
HBASE-1760 Cleanup TODOs in HTable

View File

@ -83,6 +83,7 @@ public class StoreFile {
// Config keys.
static final String IO_STOREFILE_BLOOM_ERROR_RATE = "io.storefile.bloom.error.rate";
static final String IO_STOREFILE_BLOOM_MAX_FOLD = "io.storefile.bloom.max.fold";
static final String IO_STOREFILE_BLOOM_MAX_KEYS = "io.storefile.bloom.max.keys";
static final String IO_STOREFILE_BLOOM_ENABLED = "io.storefile.bloom.enabled";
static final String HFILE_BLOCK_CACHE_SIZE_KEY = "hfile.block.cache.size";
@ -691,6 +692,9 @@ public class StoreFile {
this.kvComparator = comparator;
BloomFilter bloom = null;
BloomType bt = BloomType.NONE;
if (bloomType != BloomType.NONE && conf != null) {
float err = conf.getFloat(IO_STOREFILE_BLOOM_ERROR_RATE, (float)0.01);
// Since in row+col blooms we have 2 calls to shouldSeek() instead of 1
@ -701,15 +705,31 @@ public class StoreFile {
err /= 2;
}
int maxFold = conf.getInt(IO_STOREFILE_BLOOM_MAX_FOLD, 7);
int tooBig = conf.getInt(IO_STOREFILE_BLOOM_MAX_KEYS, 128*1000*1000);
this.bloomFilter = new ByteBloomFilter(maxKeys, err,
Hash.getHashType(conf), maxFold);
this.bloomFilter.allocBloom();
this.bloomType = bloomType;
} else {
this.bloomFilter = null;
this.bloomType = BloomType.NONE;
if (maxKeys < tooBig) {
try {
bloom = new ByteBloomFilter(maxKeys, err,
Hash.getHashType(conf), maxFold);
bloom.allocBloom();
bt = bloomType;
} catch (IllegalArgumentException iae) {
LOG.warn(String.format(
"Parse error while creating bloom for %s (%d, %f)",
path, maxKeys, err), iae);
bloom = null;
bt = BloomType.NONE;
}
} else {
if (LOG.isDebugEnabled()) {
LOG.debug("Skipping bloom filter because max keysize too large: "
+ maxKeys);
}
}
}
this.bloomFilter = bloom;
this.bloomType = bt;
}
/**
@ -821,6 +841,10 @@ public class StoreFile {
return this.writer.getPath();
}
/**
 * @return true if a bloom filter was allocated for this writer; false if bloom
 *         creation was disabled, failed, or was skipped (e.g. maxKeys too large).
 */
boolean hasBloom() {
return this.bloomFilter != null;
}
public void append(final byte [] key, final byte [] value) throws IOException {
if (this.bloomFilter != null) {
// only add to the bloom filter on a new row

View File

@ -52,7 +52,7 @@ public class ByteBloomFilter implements BloomFilter {
public static final int VERSION = 1;
/** Bytes (B) in the array */
protected int byteSize;
protected long byteSize;
/** Number of hash functions */
protected final int hashCount;
/** Hash type */
@ -134,11 +134,11 @@ public class ByteBloomFilter implements BloomFilter {
*
* The probability of false positives is minimized when k = m/n ln(2).
*/
int bitSize = (int)Math.ceil(maxKeys * (Math.log(errorRate) / Math.log(0.6185)));
long bitSize = (long)Math.ceil(maxKeys * (Math.log(errorRate) / Math.log(0.6185)));
int functionCount = (int)Math.ceil(Math.log(2) * (bitSize / maxKeys));
// increase byteSize so folding is possible
int byteSize = (bitSize + 7) / 8;
long byteSize = (bitSize + 7) / 8;
int mask = (1 << foldFactor) - 1;
if ( (mask & byteSize) != 0) {
byteSize >>= foldFactor;
@ -161,13 +161,13 @@ public class ByteBloomFilter implements BloomFilter {
if (this.bloom != null) {
throw new IllegalArgumentException("can only create bloom once.");
}
this.bloom = ByteBuffer.allocate(this.byteSize);
this.bloom = ByteBuffer.allocate((int)this.byteSize);
assert this.bloom.hasArray();
}
void sanityCheck() throws IllegalArgumentException {
if(this.byteSize <= 0) {
throw new IllegalArgumentException("maxValue must be > 0");
if(0 >= this.byteSize || this.byteSize > Integer.MAX_VALUE) {
throw new IllegalArgumentException("Invalid byteSize: " + this.byteSize);
}
if(this.hashCount <= 0) {
@ -205,7 +205,7 @@ public class ByteBloomFilter implements BloomFilter {
int hash2 = this.hash.hash(buf, offset, len, hash1);
for (int i = 0; i < this.hashCount; i++) {
int hashLoc = Math.abs((hash1 + i * hash2) % (this.byteSize * 8));
long hashLoc = Math.abs((hash1 + i * hash2) % (this.byteSize * 8));
set(hashLoc);
}
@ -243,7 +243,7 @@ public class ByteBloomFilter implements BloomFilter {
int hash2 = this.hash.hash(buf, offset, length, hash1);
for (int i = 0; i < this.hashCount; i++) {
int hashLoc = Math.abs((hash1 + i * hash2) % (this.byteSize * 8));
long hashLoc = Math.abs((hash1 + i * hash2) % (this.byteSize * 8));
if (!get(hashLoc, theBloom) ) {
return false;
}
@ -259,9 +259,9 @@ public class ByteBloomFilter implements BloomFilter {
*
* @param pos index of bit
*/
void set(int pos) {
int bytePos = pos / 8;
int bitPos = pos % 8;
void set(long pos) {
int bytePos = (int)(pos / 8);
int bitPos = (int)(pos % 8);
byte curByte = bloom.get(bytePos);
curByte |= bitvals[bitPos];
bloom.put(bytePos, curByte);
@ -273,9 +273,9 @@ public class ByteBloomFilter implements BloomFilter {
* @param pos index of bit
* @return true if bit at specified index is 1, false if 0.
*/
static boolean get(int pos, ByteBuffer theBloom) {
int bytePos = pos / 8;
int bitPos = pos % 8;
static boolean get(long pos, ByteBuffer theBloom) {
int bytePos = (int)(pos / 8);
int bitPos = (int)(pos % 8);
byte curByte = theBloom.get(bytePos);
curByte &= bitvals[bitPos];
return (curByte != 0);
@ -293,7 +293,7 @@ public class ByteBloomFilter implements BloomFilter {
@Override
public int getByteSize() {
return this.byteSize;
return (int)this.byteSize;
}
@Override
@ -301,7 +301,7 @@ public class ByteBloomFilter implements BloomFilter {
// see if the actual size is exponentially smaller than expected.
if (this.keyCount > 0 && this.bloom.hasArray()) {
int pieces = 1;
int newByteSize = this.byteSize;
int newByteSize = (int)this.byteSize;
int newMaxKeys = this.maxKeys;
// while exponentially smaller & folding is lossless
@ -367,7 +367,7 @@ public class ByteBloomFilter implements BloomFilter {
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(VERSION);
out.writeInt(byteSize);
out.writeInt((int)byteSize);
out.writeInt(hashCount);
out.writeInt(hashType);
out.writeInt(keyCount);

View File

@ -40,7 +40,9 @@ import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.Reference.Range;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.io.hfile.HFileScanner;
import org.apache.hadoop.hbase.util.ByteBloomFilter;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Hash;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.mockito.Mockito;
@ -323,17 +325,10 @@ public class TestStoreFile extends HBaseTestCase {
HBaseTestingUtility.getTestDir("TestStoreFile").toString();
private static String localFormatter = "%010d";
public void testBloomFilter() throws Exception {
FileSystem fs = FileSystem.getLocal(conf);
conf.setFloat("io.hfile.bloom.error.rate", (float)0.01);
conf.setBoolean("io.hfile.bloom.enabled", true);
// write the file
Path f = new Path(ROOT_DIR, getName());
StoreFile.Writer writer = new StoreFile.Writer(fs, f,
StoreFile.DEFAULT_BLOCKSIZE_SMALL, HFile.DEFAULT_COMPRESSION_ALGORITHM,
conf, KeyValue.COMPARATOR, StoreFile.BloomType.ROW, 2000);
private void bloomWriteRead(StoreFile.Writer writer, FileSystem fs)
throws Exception {
float err = conf.getFloat(StoreFile.IO_STOREFILE_BLOOM_ERROR_RATE, 0);
Path f = writer.getPath();
long now = System.currentTimeMillis();
for (int i = 0; i < 2000; i += 2) {
String row = String.format(localFormatter, i);
@ -370,14 +365,31 @@ public class TestStoreFile extends HBaseTestCase {
System.out.println("False negatives: " + falseNeg);
assertEquals(0, falseNeg);
System.out.println("False positives: " + falsePos);
assertTrue(falsePos < 2);
if (!(falsePos <= 2* 2000 * err)) {
System.out.println("WTFBBQ! " + falsePos + ", " + (2* 2000 * err) );
}
assertTrue(falsePos <= 2* 2000 * err);
}
/**
 * Writes a store file with a ROW bloom filter (2000 keys, 1% target error
 * rate) and delegates to bloomWriteRead to verify zero false negatives and
 * a false-positive count within the configured error-rate bound.
 */
public void testBloomFilter() throws Exception {
FileSystem fs = FileSystem.getLocal(conf);
// Configure a 1% false-positive target and enable blooms before writing.
conf.setFloat(StoreFile.IO_STOREFILE_BLOOM_ERROR_RATE, (float)0.01);
conf.setBoolean(StoreFile.IO_STOREFILE_BLOOM_ENABLED, true);
// write the file
Path f = new Path(ROOT_DIR, getName());
StoreFile.Writer writer = new StoreFile.Writer(fs, f,
StoreFile.DEFAULT_BLOCKSIZE_SMALL, HFile.DEFAULT_COMPRESSION_ALGORITHM,
conf, KeyValue.COMPARATOR, StoreFile.BloomType.ROW, 2000);
bloomWriteRead(writer, fs);
}
public void testBloomTypes() throws Exception {
float err = (float) 0.01;
FileSystem fs = FileSystem.getLocal(conf);
conf.setFloat("io.hfile.bloom.error.rate", err);
conf.setBoolean("io.hfile.bloom.enabled", true);
conf.setFloat(StoreFile.IO_STOREFILE_BLOOM_ERROR_RATE, err);
conf.setBoolean(StoreFile.IO_STOREFILE_BLOOM_ENABLED, true);
int rowCount = 50;
int colCount = 10;
@ -455,6 +467,45 @@ public class TestStoreFile extends HBaseTestCase {
}
}
/**
 * Exercises the maxKeys boundary handling added for HBASE-3158: a writer
 * whose expected key count exceeds io.storefile.bloom.max.keys (or whose
 * resulting bloom byteSize cannot fit a Java array) must silently skip
 * bloom creation rather than throw.
 */
public void testBloomEdgeCases() throws Exception {
float err = (float)0.005;
FileSystem fs = FileSystem.getLocal(conf);
Path f = new Path(ROOT_DIR, getName());
conf.setFloat(StoreFile.IO_STOREFILE_BLOOM_ERROR_RATE, err);
conf.setBoolean(StoreFile.IO_STOREFILE_BLOOM_ENABLED, true);
// Cap max keys at 1000, below the 2000 requested by the writer.
conf.setInt(StoreFile.IO_STOREFILE_BLOOM_MAX_KEYS, 1000);
// this should not create a bloom because the max keys is too small
StoreFile.Writer writer = new StoreFile.Writer(fs, f,
StoreFile.DEFAULT_BLOCKSIZE_SMALL, HFile.DEFAULT_COMPRESSION_ALGORITHM,
conf, KeyValue.COMPARATOR, StoreFile.BloomType.ROW, 2000);
assertFalse(writer.hasBloom());
writer.close();
fs.delete(f, true);
// Lift the key cap entirely; remaining limits come from array sizing.
conf.setInt(StoreFile.IO_STOREFILE_BLOOM_MAX_KEYS, Integer.MAX_VALUE);
// TODO: commented out because we run out of java heap space on trunk
/*
// the below config caused IllegalArgumentException in our production cluster
// however, the resulting byteSize is < MAX_INT, so this should work properly
writer = new StoreFile.Writer(fs, f,
StoreFile.DEFAULT_BLOCKSIZE_SMALL, HFile.DEFAULT_COMPRESSION_ALGORITHM,
conf, KeyValue.COMPARATOR, StoreFile.BloomType.ROW, 272446963);
assertTrue(writer.hasBloom());
bloomWriteRead(writer, fs);
*/
// this, however, is too large and should not create a bloom
// because Java can't create a contiguous array > MAX_INT
writer = new StoreFile.Writer(fs, f,
StoreFile.DEFAULT_BLOCKSIZE_SMALL, HFile.DEFAULT_COMPRESSION_ALGORITHM,
conf, KeyValue.COMPARATOR, StoreFile.BloomType.ROW, Integer.MAX_VALUE);
assertFalse(writer.hasBloom());
writer.close();
fs.delete(f, true);
}
public void testFlushTimeComparator() {
assertOrdering(StoreFile.Comparators.FLUSH_TIME,
mockStoreFile(true, 1000, -1, "/foo/123"),