HBASE-744 BloomFilter serialization/deserialization broken

git-svn-id: https://svn.apache.org/repos/asf/hadoop/hbase/trunk@676728 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Jim Kellerman 2008-07-14 20:46:55 +00:00
parent 5162324ef9
commit 8c2399fee8
4 changed files with 197 additions and 17 deletions

View File

@ -183,6 +183,7 @@ Trunk (unreleased changes)
HBASE-742 Rename getMetainfo in HTable as getTableDescriptor
HBASE-739 HBaseAdmin.createTable() using old HTableDescription doesn't work
(Izaak Rubin via Stack)
HBASE-744 BloomFilter serialization/deserialization broken
IMPROVEMENTS
HBASE-559 MR example job to count table rows

View File

@ -226,6 +226,7 @@ public class BloomFilter extends Filter {
@Override
public void readFields(DataInput in) throws IOException {
super.readFields(in);
bits = new BitSet(this.vectorSize);
byte[] bytes = new byte[getNBytes()];
in.readFully(bytes);
for(int i = 0, byteIndex = 0, bitIndex = 0; i < vectorSize; i++, bitIndex++) {
@ -233,9 +234,6 @@ public class BloomFilter extends Filter {
bitIndex = 0;
byteIndex++;
}
if (bitIndex == 0) {
bytes[byteIndex] = 0;
}
if ((bytes[byteIndex] & bitvalues[bitIndex]) != 0) {
bits.set(i);
}

View File

@ -76,13 +76,13 @@ import org.apache.hadoop.io.Writable;
*/
public abstract class Filter implements Writable {
/** The vector size of <i>this</i> filter. */
int vectorSize;
protected int vectorSize;
/** The hash function used to map a key to several positions in the vector. */
protected HashFunction hash;
/** The number of hash function to consider. */
int nbHash;
protected int nbHash;
protected Filter() {}

View File

@ -48,9 +48,16 @@
*/
package org.onelab.test;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import junit.framework.TestCase;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.onelab.filter.*;
/**
@ -61,22 +68,196 @@ import org.onelab.filter.*;
* @version 1.0 - 8 Feb. 07
*/
public class TestFilter extends TestCase {
private static final Log LOG = LogFactory.getLog(TestFilter.class);
/** Test a BloomFilter
* @throws UnsupportedEncodingException
* @throws IOException
*/
public void testBloomFilter() throws UnsupportedEncodingException {
Filter bf = new BloomFilter(8, 2);
Key key = new StringKey("toto");
Key k2 = new StringKey("lulu");
Key k3 = new StringKey("mama");
bf.add(key);
bf.add(k2);
bf.add(k3);
assertTrue(bf.membershipTest(key));
assertTrue(bf.membershipTest(new StringKey("graknyl")));
assertFalse(bf.membershipTest(new StringKey("xyzzy")));
assertFalse(bf.membershipTest(new StringKey("abcd")));
public void testBloomFilter() throws UnsupportedEncodingException,
IOException {
final StringKey[] inserted = {
new StringKey("wmjwjzyv"),
new StringKey("baietibz"),
new StringKey("guhsgxnv"),
new StringKey("mhnqycto"),
new StringKey("xcyqafgz"),
new StringKey("zidoamgb"),
new StringKey("tftfirzd"),
new StringKey("okapqlrg"),
new StringKey("yccwzwsq"),
new StringKey("qmonufqu"),
new StringKey("wlsctews"),
new StringKey("mksdhqri"),
new StringKey("wxxllokj"),
new StringKey("eviuqpls"),
new StringKey("bavotqmj"),
new StringKey("yibqzhdl"),
new StringKey("csfqmsyr"),
new StringKey("guxliyuh"),
new StringKey("pzicietj"),
new StringKey("qdwgrqwo"),
new StringKey("ujfzecmi"),
new StringKey("dzeqfvfi"),
new StringKey("phoegsij"),
new StringKey("bvudfcou"),
new StringKey("dowzmciz"),
new StringKey("etvhkizp"),
new StringKey("rzurqycg"),
new StringKey("krqfxuge"),
new StringKey("gflcohtd"),
new StringKey("fcrcxtps"),
new StringKey("qrtovxdq"),
new StringKey("aypxwrwi"),
new StringKey("dckpyznr"),
new StringKey("mdaawnpz"),
new StringKey("pakdfvca"),
new StringKey("xjglfbez"),
new StringKey("xdsecofi"),
new StringKey("sjlrfcab"),
new StringKey("ebcjawxv"),
new StringKey("hkafkjmy"),
new StringKey("oimmwaxo"),
new StringKey("qcuzrazo"),
new StringKey("nqydfkwk"),
new StringKey("frybvmlb"),
new StringKey("amxmaqws"),
new StringKey("gtkovkgx"),
new StringKey("vgwxrwss"),
new StringKey("xrhzmcep"),
new StringKey("tafwziil"),
new StringKey("erjmncnv"),
new StringKey("heyzqzrn"),
new StringKey("sowvyhtu"),
new StringKey("heeixgzy"),
new StringKey("ktcahcob"),
new StringKey("ljhbybgg"),
new StringKey("jiqfcksl"),
new StringKey("anjdkjhm"),
new StringKey("uzcgcuxp"),
new StringKey("vzdhjqla"),
new StringKey("svhgwwzq"),
new StringKey("zhswvhbp"),
new StringKey("ueceybwy"),
new StringKey("czkqykcw"),
new StringKey("ctisayir"),
new StringKey("hppbgciu"),
new StringKey("nhzgljfk"),
new StringKey("vaziqllf"),
new StringKey("narvrrij"),
new StringKey("kcevbbqi"),
new StringKey("qymuaqnp"),
new StringKey("pwqpfhsr"),
new StringKey("peyeicuk"),
new StringKey("kudlwihi"),
new StringKey("pkmqejlm"),
new StringKey("ylwzjftl"),
new StringKey("rhqrlqar"),
new StringKey("xmftvzsp"),
new StringKey("iaemtihk"),
new StringKey("ymsbrqcu"),
new StringKey("yfnlcxto"),
new StringKey("nluqopqh"),
new StringKey("wmrzhtox"),
new StringKey("qnffhqbl"),
new StringKey("zypqpnbw"),
new StringKey("oiokhatd"),
new StringKey("mdraddiu"),
new StringKey("zqoatltt"),
new StringKey("ewhulbtm"),
new StringKey("nmswpsdf"),
new StringKey("xsjeteqe"),
new StringKey("ufubcbma"),
new StringKey("phyxvrds"),
new StringKey("vhnfldap"),
new StringKey("zrrlycmg"),
new StringKey("becotcjx"),
new StringKey("wvbubokn"),
new StringKey("avkgiopr"),
new StringKey("mbqqxmrv"),
new StringKey("ibplgvuu"),
new StringKey("dghvpkgc")
};
final StringKey[] notInserted = {
new StringKey("abcdefgh"),
new StringKey("ijklmnop"),
new StringKey("qrstuvwx"),
new StringKey("yzabcdef")
};
/*
* Bloom filters are very sensitive to the number of elements inserted into
* them.
*
* If m denotes the number of bits in the Bloom filter (vectorSize),
* n denotes the number of elements inserted into the Bloom filter and
* k represents the number of hash functions used (nbHash), then
* according to Broder and Mitzenmacher,
*
* ( http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/BloomFilterSurvey.pdf )
*
* the probability of false positives is minimized when k is
* approximately ln(2) * m/n.
*
* If we fix the number of hash functions and know the number of entries,
* then the optimal vector size m = (k * n) / ln(2)
*/
final int DEFAULT_NUMBER_OF_HASH_FUNCTIONS = 4;
BloomFilter bf = new BloomFilter(
(int) Math.ceil(
(DEFAULT_NUMBER_OF_HASH_FUNCTIONS * (1.0 * inserted.length)) /
Math.log(2.0)),
DEFAULT_NUMBER_OF_HASH_FUNCTIONS
);
for (int i = 0; i < inserted.length; i++) {
bf.add(inserted[i]);
}
// Verify that there are no false negatives and few (if any) false positives
checkFalsePositivesNegatives(bf, inserted, notInserted);
// Test serialization/deserialization
LOG.info("Checking serialization/deserialization");
ByteArrayOutputStream baos = new ByteArrayOutputStream();
DataOutputStream out = new DataOutputStream(baos);
bf.write(out);
ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
DataInputStream in = new DataInputStream(bais);
bf = new BloomFilter();
bf.readFields(in);
// Verify that there are no false negatives and few (if any) false positives
checkFalsePositivesNegatives(bf, inserted, notInserted);
}
private void checkFalsePositivesNegatives(BloomFilter bf,
StringKey[] inserted, StringKey[] notInserted) {
// Test membership for values we inserted. Should not get false negatives
LOG.info("Checking for false negatives");
for (int i = 0; i < inserted.length; i++) {
if (!bf.membershipTest(inserted[i])) {
LOG.error("false negative for: " + inserted[i]);
fail();
}
}
// Test membership for values we did not insert. It is possible to get
// false positives
LOG.info("Checking for false positives");
for (int i = 0; i < notInserted.length; i++) {
if(bf.membershipTest(notInserted[i])) {
LOG.error("false positive for: " + notInserted[i]);
fail();
}
}
LOG.info("Success!");
}
/** Test a CountingBloomFilter