HBASE-744 BloomFilter serialization/deserialization broken
git-svn-id: https://svn.apache.org/repos/asf/hadoop/hbase/trunk@676728 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5162324ef9
commit
8c2399fee8
|
@ -183,6 +183,7 @@ Trunk (unreleased changes)
|
||||||
HBASE-742 Rename getMetainfo in HTable as getTableDescriptor
|
HBASE-742 Rename getMetainfo in HTable as getTableDescriptor
|
||||||
HBASE-739 HBaseAdmin.createTable() using old HTableDescription doesn't work
|
HBASE-739 HBaseAdmin.createTable() using old HTableDescription doesn't work
|
||||||
(Izaak Rubin via Stack)
|
(Izaak Rubin via Stack)
|
||||||
|
HBASE-744 BloomFilter serialization/deserialization broken
|
||||||
|
|
||||||
IMPROVEMENTS
|
IMPROVEMENTS
|
||||||
HBASE-559 MR example job to count table rows
|
HBASE-559 MR example job to count table rows
|
||||||
|
|
|
@ -226,6 +226,7 @@ public class BloomFilter extends Filter {
|
||||||
@Override
|
@Override
|
||||||
public void readFields(DataInput in) throws IOException {
|
public void readFields(DataInput in) throws IOException {
|
||||||
super.readFields(in);
|
super.readFields(in);
|
||||||
|
bits = new BitSet(this.vectorSize);
|
||||||
byte[] bytes = new byte[getNBytes()];
|
byte[] bytes = new byte[getNBytes()];
|
||||||
in.readFully(bytes);
|
in.readFully(bytes);
|
||||||
for(int i = 0, byteIndex = 0, bitIndex = 0; i < vectorSize; i++, bitIndex++) {
|
for(int i = 0, byteIndex = 0, bitIndex = 0; i < vectorSize; i++, bitIndex++) {
|
||||||
|
@ -233,9 +234,6 @@ public class BloomFilter extends Filter {
|
||||||
bitIndex = 0;
|
bitIndex = 0;
|
||||||
byteIndex++;
|
byteIndex++;
|
||||||
}
|
}
|
||||||
if (bitIndex == 0) {
|
|
||||||
bytes[byteIndex] = 0;
|
|
||||||
}
|
|
||||||
if ((bytes[byteIndex] & bitvalues[bitIndex]) != 0) {
|
if ((bytes[byteIndex] & bitvalues[bitIndex]) != 0) {
|
||||||
bits.set(i);
|
bits.set(i);
|
||||||
}
|
}
|
||||||
|
|
|
@ -76,13 +76,13 @@ import org.apache.hadoop.io.Writable;
|
||||||
*/
|
*/
|
||||||
public abstract class Filter implements Writable {
|
public abstract class Filter implements Writable {
|
||||||
/** The vector size of <i>this</i> filter. */
|
/** The vector size of <i>this</i> filter. */
|
||||||
int vectorSize;
|
protected int vectorSize;
|
||||||
|
|
||||||
/** The hash function used to map a key to several positions in the vector. */
|
/** The hash function used to map a key to several positions in the vector. */
|
||||||
protected HashFunction hash;
|
protected HashFunction hash;
|
||||||
|
|
||||||
/** The number of hash function to consider. */
|
/** The number of hash function to consider. */
|
||||||
int nbHash;
|
protected int nbHash;
|
||||||
|
|
||||||
protected Filter() {}
|
protected Filter() {}
|
||||||
|
|
||||||
|
|
|
@ -48,9 +48,16 @@
|
||||||
*/
|
*/
|
||||||
package org.onelab.test;
|
package org.onelab.test;
|
||||||
|
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
|
import java.io.DataInputStream;
|
||||||
|
import java.io.DataOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
import java.io.UnsupportedEncodingException;
|
import java.io.UnsupportedEncodingException;
|
||||||
import junit.framework.TestCase;
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
import org.onelab.filter.*;
|
import org.onelab.filter.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -61,22 +68,196 @@ import org.onelab.filter.*;
|
||||||
* @version 1.0 - 8 Feb. 07
|
* @version 1.0 - 8 Feb. 07
|
||||||
*/
|
*/
|
||||||
public class TestFilter extends TestCase {
|
public class TestFilter extends TestCase {
|
||||||
|
private static final Log LOG = LogFactory.getLog(TestFilter.class);
|
||||||
|
|
||||||
/** Test a BloomFilter
|
/** Test a BloomFilter
|
||||||
* @throws UnsupportedEncodingException
|
* @throws UnsupportedEncodingException
|
||||||
|
* @throws IOException
|
||||||
*/
|
*/
|
||||||
public void testBloomFilter() throws UnsupportedEncodingException {
|
public void testBloomFilter() throws UnsupportedEncodingException,
|
||||||
Filter bf = new BloomFilter(8, 2);
|
IOException {
|
||||||
Key key = new StringKey("toto");
|
final StringKey[] inserted = {
|
||||||
Key k2 = new StringKey("lulu");
|
new StringKey("wmjwjzyv"),
|
||||||
Key k3 = new StringKey("mama");
|
new StringKey("baietibz"),
|
||||||
bf.add(key);
|
new StringKey("guhsgxnv"),
|
||||||
bf.add(k2);
|
new StringKey("mhnqycto"),
|
||||||
bf.add(k3);
|
new StringKey("xcyqafgz"),
|
||||||
assertTrue(bf.membershipTest(key));
|
new StringKey("zidoamgb"),
|
||||||
assertTrue(bf.membershipTest(new StringKey("graknyl")));
|
new StringKey("tftfirzd"),
|
||||||
assertFalse(bf.membershipTest(new StringKey("xyzzy")));
|
new StringKey("okapqlrg"),
|
||||||
assertFalse(bf.membershipTest(new StringKey("abcd")));
|
new StringKey("yccwzwsq"),
|
||||||
|
new StringKey("qmonufqu"),
|
||||||
|
new StringKey("wlsctews"),
|
||||||
|
new StringKey("mksdhqri"),
|
||||||
|
new StringKey("wxxllokj"),
|
||||||
|
new StringKey("eviuqpls"),
|
||||||
|
new StringKey("bavotqmj"),
|
||||||
|
new StringKey("yibqzhdl"),
|
||||||
|
new StringKey("csfqmsyr"),
|
||||||
|
new StringKey("guxliyuh"),
|
||||||
|
new StringKey("pzicietj"),
|
||||||
|
new StringKey("qdwgrqwo"),
|
||||||
|
new StringKey("ujfzecmi"),
|
||||||
|
new StringKey("dzeqfvfi"),
|
||||||
|
new StringKey("phoegsij"),
|
||||||
|
new StringKey("bvudfcou"),
|
||||||
|
new StringKey("dowzmciz"),
|
||||||
|
new StringKey("etvhkizp"),
|
||||||
|
new StringKey("rzurqycg"),
|
||||||
|
new StringKey("krqfxuge"),
|
||||||
|
new StringKey("gflcohtd"),
|
||||||
|
new StringKey("fcrcxtps"),
|
||||||
|
new StringKey("qrtovxdq"),
|
||||||
|
new StringKey("aypxwrwi"),
|
||||||
|
new StringKey("dckpyznr"),
|
||||||
|
new StringKey("mdaawnpz"),
|
||||||
|
new StringKey("pakdfvca"),
|
||||||
|
new StringKey("xjglfbez"),
|
||||||
|
new StringKey("xdsecofi"),
|
||||||
|
new StringKey("sjlrfcab"),
|
||||||
|
new StringKey("ebcjawxv"),
|
||||||
|
new StringKey("hkafkjmy"),
|
||||||
|
new StringKey("oimmwaxo"),
|
||||||
|
new StringKey("qcuzrazo"),
|
||||||
|
new StringKey("nqydfkwk"),
|
||||||
|
new StringKey("frybvmlb"),
|
||||||
|
new StringKey("amxmaqws"),
|
||||||
|
new StringKey("gtkovkgx"),
|
||||||
|
new StringKey("vgwxrwss"),
|
||||||
|
new StringKey("xrhzmcep"),
|
||||||
|
new StringKey("tafwziil"),
|
||||||
|
new StringKey("erjmncnv"),
|
||||||
|
new StringKey("heyzqzrn"),
|
||||||
|
new StringKey("sowvyhtu"),
|
||||||
|
new StringKey("heeixgzy"),
|
||||||
|
new StringKey("ktcahcob"),
|
||||||
|
new StringKey("ljhbybgg"),
|
||||||
|
new StringKey("jiqfcksl"),
|
||||||
|
new StringKey("anjdkjhm"),
|
||||||
|
new StringKey("uzcgcuxp"),
|
||||||
|
new StringKey("vzdhjqla"),
|
||||||
|
new StringKey("svhgwwzq"),
|
||||||
|
new StringKey("zhswvhbp"),
|
||||||
|
new StringKey("ueceybwy"),
|
||||||
|
new StringKey("czkqykcw"),
|
||||||
|
new StringKey("ctisayir"),
|
||||||
|
new StringKey("hppbgciu"),
|
||||||
|
new StringKey("nhzgljfk"),
|
||||||
|
new StringKey("vaziqllf"),
|
||||||
|
new StringKey("narvrrij"),
|
||||||
|
new StringKey("kcevbbqi"),
|
||||||
|
new StringKey("qymuaqnp"),
|
||||||
|
new StringKey("pwqpfhsr"),
|
||||||
|
new StringKey("peyeicuk"),
|
||||||
|
new StringKey("kudlwihi"),
|
||||||
|
new StringKey("pkmqejlm"),
|
||||||
|
new StringKey("ylwzjftl"),
|
||||||
|
new StringKey("rhqrlqar"),
|
||||||
|
new StringKey("xmftvzsp"),
|
||||||
|
new StringKey("iaemtihk"),
|
||||||
|
new StringKey("ymsbrqcu"),
|
||||||
|
new StringKey("yfnlcxto"),
|
||||||
|
new StringKey("nluqopqh"),
|
||||||
|
new StringKey("wmrzhtox"),
|
||||||
|
new StringKey("qnffhqbl"),
|
||||||
|
new StringKey("zypqpnbw"),
|
||||||
|
new StringKey("oiokhatd"),
|
||||||
|
new StringKey("mdraddiu"),
|
||||||
|
new StringKey("zqoatltt"),
|
||||||
|
new StringKey("ewhulbtm"),
|
||||||
|
new StringKey("nmswpsdf"),
|
||||||
|
new StringKey("xsjeteqe"),
|
||||||
|
new StringKey("ufubcbma"),
|
||||||
|
new StringKey("phyxvrds"),
|
||||||
|
new StringKey("vhnfldap"),
|
||||||
|
new StringKey("zrrlycmg"),
|
||||||
|
new StringKey("becotcjx"),
|
||||||
|
new StringKey("wvbubokn"),
|
||||||
|
new StringKey("avkgiopr"),
|
||||||
|
new StringKey("mbqqxmrv"),
|
||||||
|
new StringKey("ibplgvuu"),
|
||||||
|
new StringKey("dghvpkgc")
|
||||||
|
};
|
||||||
|
|
||||||
|
final StringKey[] notInserted = {
|
||||||
|
new StringKey("abcdefgh"),
|
||||||
|
new StringKey("ijklmnop"),
|
||||||
|
new StringKey("qrstuvwx"),
|
||||||
|
new StringKey("yzabcdef")
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Bloom filters are very sensitive to the number of elements inserted into
|
||||||
|
* them.
|
||||||
|
*
|
||||||
|
* If m denotes the number of bits in the Bloom filter (vectorSize),
|
||||||
|
* n denotes the number of elements inserted into the Bloom filter and
|
||||||
|
* k represents the number of hash functions used (nbHash), then
|
||||||
|
* according to Broder and Mitzenmacher,
|
||||||
|
*
|
||||||
|
* ( http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/BloomFilterSurvey.pdf )
|
||||||
|
*
|
||||||
|
* the probability of false positives is minimized when k is
|
||||||
|
* approximately ln(2) * m/n.
|
||||||
|
*
|
||||||
|
* If we fix the number of hash functions and know the number of entries,
|
||||||
|
* then the optimal vector size m = (k * n) / ln(2)
|
||||||
|
*/
|
||||||
|
final int DEFAULT_NUMBER_OF_HASH_FUNCTIONS = 4;
|
||||||
|
BloomFilter bf = new BloomFilter(
|
||||||
|
(int) Math.ceil(
|
||||||
|
(DEFAULT_NUMBER_OF_HASH_FUNCTIONS * (1.0 * inserted.length)) /
|
||||||
|
Math.log(2.0)),
|
||||||
|
DEFAULT_NUMBER_OF_HASH_FUNCTIONS
|
||||||
|
);
|
||||||
|
|
||||||
|
for (int i = 0; i < inserted.length; i++) {
|
||||||
|
bf.add(inserted[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify that there are no false negatives and few (if any) false positives
|
||||||
|
|
||||||
|
checkFalsePositivesNegatives(bf, inserted, notInserted);
|
||||||
|
|
||||||
|
// Test serialization/deserialization
|
||||||
|
|
||||||
|
LOG.info("Checking serialization/deserialization");
|
||||||
|
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||||
|
DataOutputStream out = new DataOutputStream(baos);
|
||||||
|
bf.write(out);
|
||||||
|
ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
|
||||||
|
DataInputStream in = new DataInputStream(bais);
|
||||||
|
bf = new BloomFilter();
|
||||||
|
bf.readFields(in);
|
||||||
|
|
||||||
|
// Verify that there are no false negatives and few (if any) false positives
|
||||||
|
|
||||||
|
checkFalsePositivesNegatives(bf, inserted, notInserted);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void checkFalsePositivesNegatives(BloomFilter bf,
|
||||||
|
StringKey[] inserted, StringKey[] notInserted) {
|
||||||
|
// Test membership for values we inserted. Should not get false negatives
|
||||||
|
|
||||||
|
LOG.info("Checking for false negatives");
|
||||||
|
for (int i = 0; i < inserted.length; i++) {
|
||||||
|
if (!bf.membershipTest(inserted[i])) {
|
||||||
|
LOG.error("false negative for: " + inserted[i]);
|
||||||
|
fail();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test membership for values we did not insert. It is possible to get
|
||||||
|
// false positives
|
||||||
|
|
||||||
|
LOG.info("Checking for false positives");
|
||||||
|
for (int i = 0; i < notInserted.length; i++) {
|
||||||
|
if(bf.membershipTest(notInserted[i])) {
|
||||||
|
LOG.error("false positive for: " + notInserted[i]);
|
||||||
|
fail();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
LOG.info("Success!");
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Test a CountingBloomFilter
|
/** Test a CountingBloomFilter
|
||||||
|
|
Loading…
Reference in New Issue