HBASE-744 BloomFilter serialization/deserialization broken
git-svn-id: https://svn.apache.org/repos/asf/hadoop/hbase/trunk@676728 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5162324ef9
commit
8c2399fee8
|
@ -183,6 +183,7 @@ Trunk (unreleased changes)
|
|||
HBASE-742 Rename getMetainfo in HTable as getTableDescriptor
|
||||
HBASE-739 HBaseAdmin.createTable() using old HTableDescription doesn't work
|
||||
(Izaak Rubin via Stack)
|
||||
HBASE-744 BloomFilter serialization/deserialization broken
|
||||
|
||||
IMPROVEMENTS
|
||||
HBASE-559 MR example job to count table rows
|
||||
|
|
|
@ -226,6 +226,7 @@ public class BloomFilter extends Filter {
|
|||
@Override
|
||||
public void readFields(DataInput in) throws IOException {
|
||||
super.readFields(in);
|
||||
bits = new BitSet(this.vectorSize);
|
||||
byte[] bytes = new byte[getNBytes()];
|
||||
in.readFully(bytes);
|
||||
for(int i = 0, byteIndex = 0, bitIndex = 0; i < vectorSize; i++, bitIndex++) {
|
||||
|
@ -233,9 +234,6 @@ public class BloomFilter extends Filter {
|
|||
bitIndex = 0;
|
||||
byteIndex++;
|
||||
}
|
||||
if (bitIndex == 0) {
|
||||
bytes[byteIndex] = 0;
|
||||
}
|
||||
if ((bytes[byteIndex] & bitvalues[bitIndex]) != 0) {
|
||||
bits.set(i);
|
||||
}
|
||||
|
|
|
@ -76,13 +76,13 @@ import org.apache.hadoop.io.Writable;
|
|||
*/
|
||||
public abstract class Filter implements Writable {
|
||||
/** The vector size of <i>this</i> filter. */
|
||||
int vectorSize;
|
||||
protected int vectorSize;
|
||||
|
||||
/** The hash function used to map a key to several positions in the vector. */
|
||||
protected HashFunction hash;
|
||||
|
||||
/** The number of hash function to consider. */
|
||||
int nbHash;
|
||||
protected int nbHash;
|
||||
|
||||
protected Filter() {}
|
||||
|
||||
|
|
|
@ -48,9 +48,16 @@
|
|||
*/
|
||||
package org.onelab.test;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.onelab.filter.*;
|
||||
|
||||
/**
|
||||
|
@ -61,22 +68,196 @@ import org.onelab.filter.*;
|
|||
* @version 1.0 - 8 Feb. 07
|
||||
*/
|
||||
public class TestFilter extends TestCase {
|
||||
private static final Log LOG = LogFactory.getLog(TestFilter.class);
|
||||
|
||||
/** Test a BloomFilter
|
||||
* @throws UnsupportedEncodingException
|
||||
* @throws IOException
|
||||
*/
|
||||
public void testBloomFilter() throws UnsupportedEncodingException {
|
||||
Filter bf = new BloomFilter(8, 2);
|
||||
Key key = new StringKey("toto");
|
||||
Key k2 = new StringKey("lulu");
|
||||
Key k3 = new StringKey("mama");
|
||||
bf.add(key);
|
||||
bf.add(k2);
|
||||
bf.add(k3);
|
||||
assertTrue(bf.membershipTest(key));
|
||||
assertTrue(bf.membershipTest(new StringKey("graknyl")));
|
||||
assertFalse(bf.membershipTest(new StringKey("xyzzy")));
|
||||
assertFalse(bf.membershipTest(new StringKey("abcd")));
|
||||
public void testBloomFilter() throws UnsupportedEncodingException,
|
||||
IOException {
|
||||
final StringKey[] inserted = {
|
||||
new StringKey("wmjwjzyv"),
|
||||
new StringKey("baietibz"),
|
||||
new StringKey("guhsgxnv"),
|
||||
new StringKey("mhnqycto"),
|
||||
new StringKey("xcyqafgz"),
|
||||
new StringKey("zidoamgb"),
|
||||
new StringKey("tftfirzd"),
|
||||
new StringKey("okapqlrg"),
|
||||
new StringKey("yccwzwsq"),
|
||||
new StringKey("qmonufqu"),
|
||||
new StringKey("wlsctews"),
|
||||
new StringKey("mksdhqri"),
|
||||
new StringKey("wxxllokj"),
|
||||
new StringKey("eviuqpls"),
|
||||
new StringKey("bavotqmj"),
|
||||
new StringKey("yibqzhdl"),
|
||||
new StringKey("csfqmsyr"),
|
||||
new StringKey("guxliyuh"),
|
||||
new StringKey("pzicietj"),
|
||||
new StringKey("qdwgrqwo"),
|
||||
new StringKey("ujfzecmi"),
|
||||
new StringKey("dzeqfvfi"),
|
||||
new StringKey("phoegsij"),
|
||||
new StringKey("bvudfcou"),
|
||||
new StringKey("dowzmciz"),
|
||||
new StringKey("etvhkizp"),
|
||||
new StringKey("rzurqycg"),
|
||||
new StringKey("krqfxuge"),
|
||||
new StringKey("gflcohtd"),
|
||||
new StringKey("fcrcxtps"),
|
||||
new StringKey("qrtovxdq"),
|
||||
new StringKey("aypxwrwi"),
|
||||
new StringKey("dckpyznr"),
|
||||
new StringKey("mdaawnpz"),
|
||||
new StringKey("pakdfvca"),
|
||||
new StringKey("xjglfbez"),
|
||||
new StringKey("xdsecofi"),
|
||||
new StringKey("sjlrfcab"),
|
||||
new StringKey("ebcjawxv"),
|
||||
new StringKey("hkafkjmy"),
|
||||
new StringKey("oimmwaxo"),
|
||||
new StringKey("qcuzrazo"),
|
||||
new StringKey("nqydfkwk"),
|
||||
new StringKey("frybvmlb"),
|
||||
new StringKey("amxmaqws"),
|
||||
new StringKey("gtkovkgx"),
|
||||
new StringKey("vgwxrwss"),
|
||||
new StringKey("xrhzmcep"),
|
||||
new StringKey("tafwziil"),
|
||||
new StringKey("erjmncnv"),
|
||||
new StringKey("heyzqzrn"),
|
||||
new StringKey("sowvyhtu"),
|
||||
new StringKey("heeixgzy"),
|
||||
new StringKey("ktcahcob"),
|
||||
new StringKey("ljhbybgg"),
|
||||
new StringKey("jiqfcksl"),
|
||||
new StringKey("anjdkjhm"),
|
||||
new StringKey("uzcgcuxp"),
|
||||
new StringKey("vzdhjqla"),
|
||||
new StringKey("svhgwwzq"),
|
||||
new StringKey("zhswvhbp"),
|
||||
new StringKey("ueceybwy"),
|
||||
new StringKey("czkqykcw"),
|
||||
new StringKey("ctisayir"),
|
||||
new StringKey("hppbgciu"),
|
||||
new StringKey("nhzgljfk"),
|
||||
new StringKey("vaziqllf"),
|
||||
new StringKey("narvrrij"),
|
||||
new StringKey("kcevbbqi"),
|
||||
new StringKey("qymuaqnp"),
|
||||
new StringKey("pwqpfhsr"),
|
||||
new StringKey("peyeicuk"),
|
||||
new StringKey("kudlwihi"),
|
||||
new StringKey("pkmqejlm"),
|
||||
new StringKey("ylwzjftl"),
|
||||
new StringKey("rhqrlqar"),
|
||||
new StringKey("xmftvzsp"),
|
||||
new StringKey("iaemtihk"),
|
||||
new StringKey("ymsbrqcu"),
|
||||
new StringKey("yfnlcxto"),
|
||||
new StringKey("nluqopqh"),
|
||||
new StringKey("wmrzhtox"),
|
||||
new StringKey("qnffhqbl"),
|
||||
new StringKey("zypqpnbw"),
|
||||
new StringKey("oiokhatd"),
|
||||
new StringKey("mdraddiu"),
|
||||
new StringKey("zqoatltt"),
|
||||
new StringKey("ewhulbtm"),
|
||||
new StringKey("nmswpsdf"),
|
||||
new StringKey("xsjeteqe"),
|
||||
new StringKey("ufubcbma"),
|
||||
new StringKey("phyxvrds"),
|
||||
new StringKey("vhnfldap"),
|
||||
new StringKey("zrrlycmg"),
|
||||
new StringKey("becotcjx"),
|
||||
new StringKey("wvbubokn"),
|
||||
new StringKey("avkgiopr"),
|
||||
new StringKey("mbqqxmrv"),
|
||||
new StringKey("ibplgvuu"),
|
||||
new StringKey("dghvpkgc")
|
||||
};
|
||||
|
||||
final StringKey[] notInserted = {
|
||||
new StringKey("abcdefgh"),
|
||||
new StringKey("ijklmnop"),
|
||||
new StringKey("qrstuvwx"),
|
||||
new StringKey("yzabcdef")
|
||||
};
|
||||
|
||||
/*
|
||||
* Bloom filters are very sensitive to the number of elements inserted into
|
||||
* them.
|
||||
*
|
||||
* If m denotes the number of bits in the Bloom filter (vectorSize),
|
||||
* n denotes the number of elements inserted into the Bloom filter and
|
||||
* k represents the number of hash functions used (nbHash), then
|
||||
* according to Broder and Mitzenmacher,
|
||||
*
|
||||
* ( http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/BloomFilterSurvey.pdf )
|
||||
*
|
||||
* the probability of false positives is minimized when k is
|
||||
* approximately ln(2) * m/n.
|
||||
*
|
||||
* If we fix the number of hash functions and know the number of entries,
|
||||
* then the optimal vector size m = (k * n) / ln(2)
|
||||
*/
|
||||
final int DEFAULT_NUMBER_OF_HASH_FUNCTIONS = 4;
|
||||
BloomFilter bf = new BloomFilter(
|
||||
(int) Math.ceil(
|
||||
(DEFAULT_NUMBER_OF_HASH_FUNCTIONS * (1.0 * inserted.length)) /
|
||||
Math.log(2.0)),
|
||||
DEFAULT_NUMBER_OF_HASH_FUNCTIONS
|
||||
);
|
||||
|
||||
for (int i = 0; i < inserted.length; i++) {
|
||||
bf.add(inserted[i]);
|
||||
}
|
||||
|
||||
// Verify that there are no false negatives and few (if any) false positives
|
||||
|
||||
checkFalsePositivesNegatives(bf, inserted, notInserted);
|
||||
|
||||
// Test serialization/deserialization
|
||||
|
||||
LOG.info("Checking serialization/deserialization");
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
DataOutputStream out = new DataOutputStream(baos);
|
||||
bf.write(out);
|
||||
ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
|
||||
DataInputStream in = new DataInputStream(bais);
|
||||
bf = new BloomFilter();
|
||||
bf.readFields(in);
|
||||
|
||||
// Verify that there are no false negatives and few (if any) false positives
|
||||
|
||||
checkFalsePositivesNegatives(bf, inserted, notInserted);
|
||||
}
|
||||
|
||||
private void checkFalsePositivesNegatives(BloomFilter bf,
|
||||
StringKey[] inserted, StringKey[] notInserted) {
|
||||
// Test membership for values we inserted. Should not get false negatives
|
||||
|
||||
LOG.info("Checking for false negatives");
|
||||
for (int i = 0; i < inserted.length; i++) {
|
||||
if (!bf.membershipTest(inserted[i])) {
|
||||
LOG.error("false negative for: " + inserted[i]);
|
||||
fail();
|
||||
}
|
||||
}
|
||||
|
||||
// Test membership for values we did not insert. It is possible to get
|
||||
// false positives
|
||||
|
||||
LOG.info("Checking for false positives");
|
||||
for (int i = 0; i < notInserted.length; i++) {
|
||||
if(bf.membershipTest(notInserted[i])) {
|
||||
LOG.error("false positive for: " + notInserted[i]);
|
||||
fail();
|
||||
}
|
||||
}
|
||||
LOG.info("Success!");
|
||||
}
|
||||
|
||||
/** Test a CountingBloomFilter
|
||||
|
|
Loading…
Reference in New Issue