HBASE-1381 Remove onelab and bloom filters files from hbase

git-svn-id: https://svn.apache.org/repos/asf/hadoop/hbase/trunk@772429 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael Stack 2009-05-06 21:16:56 +00:00
parent e78e38dce8
commit 2597f74d10
14 changed files with 6 additions and 2590 deletions

View File

@ -17,6 +17,7 @@ Release 0.20.0 - Unreleased
HBASE-1342 Add to filesystem info needed to rebuild .META.
HBASE-1361 Disable bloom filters
HBASE-1367 Get rid of Thrift exception 'NotFound'
HBASE-1381 Remove onelab and bloom filters files from hbase
BUG FIXES
HBASE-1140 "ant clean test" fails (Nitay Joffe via Stack)

View File

@ -1,260 +0,0 @@
/**
* Copyright 2008 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.io;
import java.io.IOException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HStoreKey;
import org.apache.hadoop.hbase.util.Hash;
import org.apache.hadoop.hbase.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.onelab.filter.BloomFilter;
import org.onelab.filter.Key;
/**
* On write, all keys are added to a bloom filter. On read, all keys are
* tested first against bloom filter. Keys are HStoreKey. If passed bloom
* filter is null, just passes invocation to parent.
*/
// TODO: fix the generic-type warnings raised by the MapFile methods
@SuppressWarnings("unchecked")
public class BloomFilterMapFile extends HBaseMapFile {
  static final Log LOG = LogFactory.getLog(BloomFilterMapFile.class);
  /** Name of the file, inside the map file directory, that holds the filter. */
  protected static final String BLOOMFILTER_FILE_NAME = "filter";

  /**
   * Reader that, when a bloom filter has been loaded, asks the filter about
   * the row of the requested key before delegating the lookup to the parent
   * reader.
   */
  public static class Reader extends HBaseReader {
    /** Filter read from disk, or null when filtering is off or unavailable. */
    private final BloomFilter bloomFilter;

    /**
     * @param fs filesystem the map file lives on
     * @param dirName directory of the map file
     * @param conf configuration handed to the parent reader
     * @param filter true to try loading the bloom filter file
     * @param blockCacheEnabled handed to the parent reader
     * @param hri handed to the parent reader
     * @throws IOException
     */
    public Reader(FileSystem fs, String dirName, Configuration conf,
        final boolean filter, final boolean blockCacheEnabled,
        HRegionInfo hri)
    throws IOException {
      super(fs, dirName, conf, blockCacheEnabled, hri);
      this.bloomFilter = filter ? loadBloomFilter(fs, dirName) : null;
    }

    /**
     * Reads the serialized bloom filter out of <code>dirName</code>.
     * @return the deserialized filter, or null (after logging a warning) if
     * the filter file does not exist
     */
    private BloomFilter loadBloomFilter(FileSystem fs, String dirName)
    throws IOException {
      final Path filterPath = new Path(dirName, BLOOMFILTER_FILE_NAME);
      if (fs.exists(filterPath)) {
        final BloomFilter loaded = new BloomFilter();
        final FSDataInputStream stream = fs.open(filterPath);
        try {
          loaded.readFields(stream);
        } finally {
          stream.close();
        }
        return loaded;
      }
      LOG.warn("FileNotFound: " + filterPath + "; proceeding without");
      return null;
    }

    /**
     * Decides whether a lookup for <code>key</code> should go to the parent
     * reader.
     * @return true if there is no filter or the filter reports the key's row
     * as present; false if the filter reports it as absent
     */
    private boolean passesBloomFilter(final WritableComparable key) {
      if (bloomFilter == null) {
        return true;
      }
      final boolean present =
        bloomFilter.membershipTest(getBloomFilterKey(key));
      if (LOG.isDebugEnabled()) {
        LOG.debug(present
          ? "bloom filter reported that key exists"
          : "bloom filter reported that key does not exist");
      }
      return present;
    }

    /**
     * Returns null without touching the file when the bloom filter rejects
     * the key; otherwise defers to the parent.
     * @see org.apache.hadoop.hbase.io.MapFile.Reader#get(org.apache.hadoop.io.WritableComparable, org.apache.hadoop.io.Writable)
     */
    @Override
    public Writable get(WritableComparable key, Writable val)
    throws IOException {
      return passesBloomFilter(key) ? super.get(key, val) : null;
    }

    /**
     * Returns null without touching the file when the bloom filter rejects
     * the key; otherwise defers to the parent. The key handed in is always
     * a HStoreKey.
     * @see org.apache.hadoop.hbase.io.MapFile.Reader#getClosest(org.apache.hadoop.io.WritableComparable, org.apache.hadoop.io.Writable)
     */
    @Override
    public WritableComparable getClosest(WritableComparable key,
        Writable val) throws IOException {
      return passesBloomFilter(key) ? super.getClosest(key, val) : null;
    }

    /**
     * @return vector size of the bloom filter, or 0 when none is loaded
     */
    public int getBloomFilterSize() {
      if (bloomFilter != null) {
        return bloomFilter.getVectorSize();
      }
      return 0;
    }
  }

  /**
   * Writer that records the row of every appended key in a bloom filter and
   * saves that filter next to the map file on close.
   */
  public static class Writer extends HBaseWriter {
    private static final double DEFAULT_NUMBER_OF_HASH_FUNCTIONS = 4.0;
    /** Filter being populated, or null when filtering is off or failed. */
    private final BloomFilter bloomFilter;
    private final String dirName;
    private final FileSystem fs;

    /**
     * @param conf configuration; also used to pick the hash type
     * @param fs filesystem to write to
     * @param dirName directory of the map file
     * @param compression handed to the parent writer
     * @param filter true to build a bloom filter while writing
     * @param nrows expected number of entries, used to size the filter
     * @param hri handed to the parent writer
     * @throws IOException
     */
    public Writer(Configuration conf, FileSystem fs, String dirName,
        SequenceFile.CompressionType compression, final boolean filter,
        int nrows, final HRegionInfo hri)
    throws IOException {
      super(conf, fs, dirName, compression, hri);
      this.dirName = dirName;
      this.fs = fs;
      this.bloomFilter = filter ? newBloomFilter(conf, nrows) : null;
    }

    /**
     * Builds a filter sized for <code>nrows</code> entries.
     * <p>
     * There is no way to automatically determine the vector size and the
     * number of hash functions to use. In particular, bloom filters are
     * very sensitive to the number of elements inserted into them. For
     * HBase, the number of entries depends on the size of the data stored
     * in the column. Currently the default region size is 256MB, so the
     * number of entries is approximately
     * 256MB / (average value size for column).
     * <p>
     * If m denotes the number of bits in the Bloom filter (vectorSize),
     * n denotes the number of elements inserted into the Bloom filter and
     * k represents the number of hash functions used (nbHash), then
     * according to Broder and Mitzenmacher,
     * <p>
     * ( http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/BloomFilterSurvey.pdf )
     * <p>
     * the probability of false positives is minimized when k is
     * approximately m/n ln(2).
     * <p>
     * If we fix the number of hash functions and know the number of
     * entries, then the optimal vector size m = (k * n) / ln(2)
     *
     * @return the new filter, or null (after logging a warning) if the
     * BloomFilter constructor rejected the arguments
     */
    private static BloomFilter newBloomFilter(Configuration conf, int nrows) {
      final double optimalBits =
        (DEFAULT_NUMBER_OF_HASH_FUNCTIONS * (1.0 * nrows)) / Math.log(2.0);
      try {
        return new BloomFilter(
          (int) Math.ceil(optimalBits),
          (int) DEFAULT_NUMBER_OF_HASH_FUNCTIONS,
          Hash.getHashType(conf));
      } catch (IllegalArgumentException e) {
        LOG.warn("Failed creating bloomfilter; proceeding without", e);
        return null;
      }
    }

    /**
     * Adds the key's row to the bloom filter (if any), then appends the
     * entry through the parent writer.
     * @see org.apache.hadoop.hbase.io.MapFile.Writer#append(org.apache.hadoop.io.WritableComparable, org.apache.hadoop.io.Writable)
     */
    @Override
    public void append(WritableComparable key, Writable val)
    throws IOException {
      final BloomFilter target = this.bloomFilter;
      if (target != null) {
        target.add(getBloomFilterKey(key));
      }
      super.append(key, val);
    }

    /**
     * Closes the parent writer first and then, if a filter was built,
     * writes it out.
     * @see org.apache.hadoop.hbase.io.MapFile.Writer#close()
     */
    @Override
    public synchronized void close() throws IOException {
      super.close();
      if (this.bloomFilter == null) {
        return;
      }
      flushBloomFilter();
    }

    /**
     * Serializes the bloom filter into the filter file under the map file
     * directory.
     *
     * @throws IOException
     */
    private void flushBloomFilter() throws IOException {
      if (LOG.isDebugEnabled()) {
        LOG.debug("flushing bloom filter for " + this.dirName);
      }
      final Path target = new Path(dirName, BLOOMFILTER_FILE_NAME);
      final FSDataOutputStream stream = fs.create(target);
      try {
        bloomFilter.write(stream);
      } finally {
        stream.close();
      }
      if (LOG.isDebugEnabled()) {
        LOG.debug("flushed bloom filter for " + this.dirName);
      }
    }
  }

  /**
   * Custom bloom filter key maker.
   * @param key key to convert; cast to HStoreKey
   * @return Key made of bytes of row only.
   */
  protected static Key getBloomFilterKey(WritableComparable key) {
    final HStoreKey storeKey = (HStoreKey) key;
    return new Key(storeKey.getRow());
  }
}

View File

@ -44,7 +44,7 @@ import org.apache.hadoop.io.WritableComparable;
* <p>This file is not splitable. Calls to {@link #midKey()} return null.
*/
//TODO should be fixed generic warnings from MapFile methods
public class HalfMapFileReader extends BloomFilterMapFile.Reader {
public class HalfMapFileReader extends HBaseMapFile.HBaseReader {
private final boolean top;
private final HStoreKey midkey;
private boolean firstNextCall = true;
@ -63,7 +63,7 @@ public class HalfMapFileReader extends BloomFilterMapFile.Reader {
final WritableComparable<HStoreKey> mk,
final HRegionInfo hri)
throws IOException {
this(fs, dirName, conf, r, mk, false, false, hri);
this(fs, dirName, conf, r, mk, false, hri);
}
/**
@ -72,18 +72,17 @@ public class HalfMapFileReader extends BloomFilterMapFile.Reader {
* @param conf
* @param r
* @param mk
* @param filter
* @param blockCacheEnabled
* @param hri
* @throws IOException
*/
public HalfMapFileReader(final FileSystem fs, final String dirName,
final Configuration conf, final Range r,
final WritableComparable<HStoreKey> mk, final boolean filter,
final WritableComparable<HStoreKey> mk,
final boolean blockCacheEnabled,
final HRegionInfo hri)
throws IOException {
super(fs, dirName, conf, filter, blockCacheEnabled, hri);
super(fs, dirName, conf, blockCacheEnabled, hri);
// This is not actual midkey for this half-file; its just border
// around which we split top and bottom. Have to look in files to find
// actual last and first keys for bottom and top halves. Half-files don't
@ -211,4 +210,4 @@ public class HalfMapFileReader extends BloomFilterMapFile.Reader {
checkKey(key);
return super.seek(key);
}
}
}

View File

@ -24,9 +24,6 @@ import org.apache.hadoop.conf.Configuration;
* This class represents a common API for hashing functions.
*/
public abstract class Hash {
// TODO: Fix the design tangle that has classes over in org.onelab.filter
// referring to this class. Would need to also move the Jenkins and Murmur
// hashing function too.
/** Constant to denote invalid hash type. */
public static final int INVALID_HASH = -1;
/** Constant to denote {@link JenkinsHash}. */

View File

@ -1,238 +0,0 @@
/**
*
* Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org)
* All rights reserved.
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
* - Neither the name of the University Catholique de Louvain - UCL
* nor the names of its contributors may be used to endorse or
* promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.onelab.filter;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.BitSet;
/**
* Implements a <i>Bloom filter</i>, as defined by Bloom in 1970.
* <p>
* The Bloom filter is a data structure that was introduced in 1970 and that has been adopted by
* the networking research community in the past decade thanks to the bandwidth efficiencies that it
* offers for the transmission of set membership information between networked hosts. A sender encodes
* the information into a bit vector, the Bloom filter, that is more compact than a conventional
* representation. Computation and space costs for construction are linear in the number of elements.
* The receiver uses the filter to test whether various elements are members of the set. Though the
* filter will occasionally return a false positive, it will never return a false negative. When creating
* the filter, the sender can choose its desired point in a trade-off between the false positive rate and the size.
*
* contract <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>.
*
* @version 1.0 - 2 Feb. 07
*
* @see org.onelab.filter.Filter The general behavior of a filter
*
* @see <a href="http://portal.acm.org/citation.cfm?id=362692&dl=ACM&coll=portal">Space/Time Trade-Offs in Hash Coding with Allowable Errors</a>
*/
public class BloomFilter extends Filter {
  /** Single-bit masks; entry <code>i</code> selects bit <code>i</code> of a byte. */
  private static final byte[] bitvalues = new byte[] {
    (byte)0x01,
    (byte)0x02,
    (byte)0x04,
    (byte)0x08,
    (byte)0x10,
    (byte)0x20,
    (byte)0x40,
    (byte)0x80
  };

  /** The bit vector. */
  BitSet bits;

  /** Default constructor - use with readFields */
  public BloomFilter() {
    super();
  }

  /**
   * Constructor
   * @param vectorSize The vector size of <i>this</i> filter.
   * @param nbHash The number of hash function to consider.
   * @param hashType type of the hashing function (see {@link Hash}).
   */
  public BloomFilter(int vectorSize, int nbHash, int hashType) {
    super(vectorSize, nbHash, hashType);
    bits = new BitSet(this.vectorSize);
  }//end constructor

  /**
   * Sets the bit at each of the <code>nbHash</code> positions computed for
   * the key.
   * @param key key to record
   * @throws NullPointerException if <code>key</code> is null
   */
  @Override
  public void add(Key key) {
    if (key == null) {
      throw new NullPointerException("key cannot be null");
    }
    int[] h = hash.hash(key);
    hash.clear();
    for (int i = 0; i < nbHash; i++) {
      bits.set(h[i]);
    }
  }//end add()

  /**
   * Intersects the bit vector of this filter with that of another filter.
   * @param filter filter to combine with; must be a BloomFilter with the
   * same vector size and number of hash functions
   * @throws IllegalArgumentException if <code>filter</code> is incompatible
   */
  @Override
  public void and(Filter filter) {
    this.bits.and(requireCompatible(filter, "and-ed").bits);
  }//end and()

  /**
   * Tests whether every position computed for the key is set.
   * @param key key to look up
   * @return false as soon as one of the key's positions is found unset,
   * true otherwise
   * @throws NullPointerException if <code>key</code> is null
   */
  @Override
  public boolean membershipTest(Key key) {
    if (key == null) {
      throw new NullPointerException("key cannot be null");
    }
    int[] h = hash.hash(key);
    hash.clear();
    for (int i = 0; i < nbHash; i++) {
      if (!bits.get(h[i])) {
        return false;
      }
    }
    return true;
  }//end membershipTest()

  /**
   * Inverts every bit of the vector, positions 0 through
   * <code>vectorSize - 1</code>.
   */
  @Override
  public void not() {
    // The upper bound of BitSet.flip(from, to) is exclusive, so vectorSize
    // (not vectorSize - 1) is needed for the last bit to be flipped too.
    bits.flip(0, vectorSize);
  }//end not()

  /**
   * Unions the bit vector of this filter with that of another filter.
   * @param filter filter to combine with; must be a BloomFilter with the
   * same vector size and number of hash functions
   * @throws IllegalArgumentException if <code>filter</code> is incompatible
   */
  @Override
  public void or(Filter filter) {
    bits.or(requireCompatible(filter, "or-ed").bits);
  }//end or()

  /**
   * Exclusive-ors the bit vector of this filter with that of another filter.
   * @param filter filter to combine with; must be a BloomFilter with the
   * same vector size and number of hash functions
   * @throws IllegalArgumentException if <code>filter</code> is incompatible
   */
  @Override
  public void xor(Filter filter) {
    bits.xor(requireCompatible(filter, "xor-ed").bits);
  }//end xor()

  /**
   * Shared precondition of the binary set operations.
   * @param filter candidate operand
   * @param operation past-participle used in the error message
   * @return <code>filter</code> cast to BloomFilter
   * @throws IllegalArgumentException if <code>filter</code> is null, is not
   * a BloomFilter, or differs in vector size or number of hash functions
   */
  private BloomFilter requireCompatible(Filter filter, String operation) {
    // instanceof is false for null, so this also rejects a null operand.
    if (!(filter instanceof BloomFilter)
        || filter.vectorSize != this.vectorSize
        || filter.nbHash != this.nbHash) {
      throw new IllegalArgumentException("filters cannot be " + operation);
    }
    return (BloomFilter) filter;
  }

  /** @return the string form of the underlying bit set */
  @Override
  public String toString() {
    return bits.toString();
  }//end toString()

  /**
   * @return a new BloomFilter with the same parameters whose bit vector is
   * a copy of this one's
   */
  @Override
  public Object clone() {
    BloomFilter bf = new BloomFilter(vectorSize, nbHash, hashType);
    // or-ing into an empty vector copies the set bits.
    bf.or(this);
    return bf;
  }//end clone()

  /**
   * @return size of the bloomfilter
   */
  public int getVectorSize() {
    return this.vectorSize;
  }

  // Writable

  /**
   * Writes the parent's fields followed by the bit vector packed eight bits
   * per byte, lowest position in the least significant bit.
   * @param out destination
   * @throws IOException if writing fails
   */
  @Override
  public void write(DataOutput out) throws IOException {
    super.write(out);
    // A new array is zero-filled, so only the set bits have to be written.
    byte[] bytes = new byte[getNBytes()];
    for (int i = 0; i < vectorSize; i++) {
      if (bits.get(i)) {
        bytes[i >> 3] |= bitvalues[i & 7];
      }
    }
    out.write(bytes);
  }

  /**
   * Reads the parent's fields and then rebuilds the bit vector from the
   * packed bytes produced by {@link #write(DataOutput)}.
   * NOTE(review): relies on super.readFields having restored vectorSize -
   * confirm in Filter.
   * @param in source
   * @throws IOException if reading fails
   */
  @Override
  public void readFields(DataInput in) throws IOException {
    super.readFields(in);
    bits = new BitSet(this.vectorSize);
    byte[] bytes = new byte[getNBytes()];
    in.readFully(bytes);
    for (int i = 0; i < vectorSize; i++) {
      if ((bytes[i >> 3] & bitvalues[i & 7]) != 0) {
        bits.set(i);
      }
    }
  }

  /* @return number of bytes needed to hold bit vector */
  private int getNBytes() {
    return (vectorSize + 7) / 8;
  }
}//end class

View File

@ -1,309 +0,0 @@
/**
*
* Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org)
* All rights reserved.
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
* - Neither the name of the University Catholique de Louvain - UCL
* nor the names of its contributors may be used to endorse or
* promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.onelab.filter;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
* Implements a <i>counting Bloom filter</i>, as defined by Fan et al. in a ToN
* 2000 paper.
* <p>
* A counting Bloom filter is an improvement to a standard Bloom filter as it
* allows dynamic additions and deletions of set membership information. This
* is achieved through the use of a counting vector instead of a bit vector.
*
* contract <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>.
*
* @version 1.1 - 19 Jan. 08
*
* @see org.onelab.filter.Filter The general behavior of a filter
*
* @see <a href="http://portal.acm.org/citation.cfm?id=343571.343572">Summary cache: a scalable wide-area web cache sharing protocol</a>
*/
public final class CountingBloomFilter extends Filter {
  /** Storage for the counting buckets */
  private long[] buckets;

  /** We are using 4bit buckets, so each bucket can count to 15 */
  private final static long BUCKET_MAX_VALUE = 15;

  /** Default constructor - use with readFields */
  public CountingBloomFilter() {}

  /**
   * Constructor
   * @param vectorSize The vector size of <i>this</i> filter.
   * @param nbHash The number of hash function to consider.
   * @param hashType type of the hashing function (see {@link Hash}).
   */
  public CountingBloomFilter(int vectorSize, int nbHash, int hashType) {
    super(vectorSize, nbHash, hashType);
    buckets = new long[buckets2words(vectorSize)];
  }//end constructor

  /** returns the number of 64 bit words it would take to hold vectorSize buckets */
  private static int buckets2words(int vectorSize) {
    return ((vectorSize - 1) >>> 4) + 1;
  }

  /** @return bit offset of a bucket inside its word: (position mod 16) * 4 */
  private static int bucketShift(int position) {
    return (position & 0x0f) << 2;
  }

  /** @return the 4-bit count stored for <code>position</code> */
  private long getBucket(int position) {
    // position >> 4 is the word holding the bucket (16 buckets per word).
    return (buckets[position >> 4] >>> bucketShift(position)) & BUCKET_MAX_VALUE;
  }

  /** Stores <code>value</code> (0..15) as the count for <code>position</code>. */
  private void setBucket(int position, long value) {
    int wordNum = position >> 4;
    int shift = bucketShift(position);
    long mask = BUCKET_MAX_VALUE << shift;
    buckets[wordNum] = (buckets[wordNum] & ~mask) | (value << shift);
  }

  /**
   * Shared precondition of the binary operations.
   * @param filter candidate operand
   * @param operation past-participle used in the error message
   * @return <code>filter</code> cast to CountingBloomFilter
   * @throws IllegalArgumentException if <code>filter</code> is null, is not
   * a CountingBloomFilter, or differs in vector size or number of hash
   * functions
   */
  private CountingBloomFilter requireCompatible(Filter filter, String operation) {
    // instanceof is false for null, so this also rejects a null operand.
    if (!(filter instanceof CountingBloomFilter)
        || filter.vectorSize != this.vectorSize
        || filter.nbHash != this.nbHash) {
      throw new IllegalArgumentException("filters cannot be " + operation);
    }
    return (CountingBloomFilter) filter;
  }

  /**
   * Increments the bucket at each of the key's positions, leaving buckets
   * that already hold the maximum value unchanged.
   * @param key key to record
   * @throws NullPointerException if <code>key</code> is null
   */
  @Override
  public void add(Key key) {
    if (key == null) {
      throw new NullPointerException("key can not be null");
    }
    int[] h = hash.hash(key);
    hash.clear();
    for (int i = 0; i < nbHash; i++) {
      long bucketValue = getBucket(h[i]);
      // only increment if the count in the bucket is less than BUCKET_MAX_VALUE
      if (bucketValue < BUCKET_MAX_VALUE) {
        setBucket(h[i], bucketValue + 1);
      }
    }
  }//end add()

  /**
   * Removes a specified key from <i>this</i> counting Bloom filter by
   * decrementing the bucket at each of the key's positions. Buckets holding
   * the maximum value are left unchanged.
   * @param key The key to remove.
   * @throws NullPointerException if <code>key</code> is null
   * @throws IllegalArgumentException if {@link #membershipTest(Key)} reports
   * that the key does not belong to <i>this</i> counting Bloom filter
   */
  public void delete(Key key) {
    if (key == null) {
      throw new NullPointerException("Key may not be null");
    }
    if (!membershipTest(key)) {
      throw new IllegalArgumentException("Key is not a member");
    }
    int[] h = hash.hash(key);
    hash.clear();
    for (int i = 0; i < nbHash; i++) {
      long bucketValue = getBucket(h[i]);
      // only decrement if the count in the bucket is between 1 and
      // BUCKET_MAX_VALUE - 1; presumably a saturated bucket is skipped
      // because its true count is no longer known.
      if (bucketValue >= 1 && bucketValue < BUCKET_MAX_VALUE) {
        setBucket(h[i], bucketValue - 1);
      }
    }
  }//end delete

  /**
   * Bitwise-ands the bucket words of this filter with those of another.
   * @param filter filter to combine with; must be a CountingBloomFilter
   * with the same vector size and number of hash functions
   * @throws IllegalArgumentException if <code>filter</code> is incompatible
   */
  @Override
  public void and(Filter filter) {
    CountingBloomFilter cbf = requireCompatible(filter, "and-ed");
    int sizeInWords = buckets2words(vectorSize);
    for (int i = 0; i < sizeInWords; i++) {
      this.buckets[i] &= cbf.buckets[i];
    }
  }//end and()

  /**
   * Tests whether the bucket at every one of the key's positions is non-zero.
   * @param key key to look up
   * @return false as soon as one of the key's buckets is found to be zero,
   * true otherwise
   * @throws NullPointerException if <code>key</code> is null
   */
  @Override
  public boolean membershipTest(Key key) {
    if (key == null) {
      throw new NullPointerException("Key may not be null");
    }
    int[] h = hash.hash(key);
    hash.clear();
    for (int i = 0; i < nbHash; i++) {
      if (getBucket(h[i]) == 0) {
        return false;
      }
    }
    return true;
  }//end membershipTest()

  /**
   * This method calculates an approximate count of the key, i.e. how many
   * times the key was added to the filter. This allows the filter to be
   * used as an approximate <code>key -&gt; count</code> map.
   * <p>NOTE: due to the bucket size of this filter, inserting the same
   * key more than 15 times will cause an overflow at all filter positions
   * associated with this key, and it will significantly increase the error
   * rate for this and other keys. For this reason the filter can only be
   * used to store small count values <code>0 &lt;= N &lt;&lt; 15</code>.
   * @param key key to be tested
   * @return 0 if the key is not present. Otherwise, a positive value v will
   * be returned such that <code>v == count</code> with probability equal to the
   * error rate of this filter, and <code>v &gt; count</code> otherwise.
   * Additionally, if the filter experienced an underflow as a result of
   * {@link #delete(Key)} operation, the return value may be lower than the
   * <code>count</code> with the probability of the false negative rate of such
   * filter.
   * @throws NullPointerException if <code>key</code> is null
   */
  public int approximateCount(Key key) {
    if (key == null) {
      throw new NullPointerException("Key may not be null");
    }
    int res = Integer.MAX_VALUE;
    int[] h = hash.hash(key);
    hash.clear();
    // The estimate is the smallest count among the key's buckets.
    for (int i = 0; i < nbHash; i++) {
      long bucketValue = getBucket(h[i]);
      if (bucketValue < res) {
        res = (int) bucketValue;
      }
    }
    if (res != Integer.MAX_VALUE) {
      return res;
    }
    // Only reached when no bucket was examined.
    return 0;
  }

  /**
   * Not supported by this filter.
   * @throws UnsupportedOperationException always
   */
  @Override
  public void not() {
    throw new UnsupportedOperationException("not() is undefined for "
        + this.getClass().getName());
  }//end not()

  /**
   * Bitwise-ors the bucket words of this filter with those of another.
   * @param filter filter to combine with; must be a CountingBloomFilter
   * with the same vector size and number of hash functions
   * @throws IllegalArgumentException if <code>filter</code> is incompatible
   */
  @Override
  public void or(Filter filter) {
    CountingBloomFilter cbf = requireCompatible(filter, "or-ed");
    int sizeInWords = buckets2words(vectorSize);
    for (int i = 0; i < sizeInWords; i++) {
      this.buckets[i] |= cbf.buckets[i];
    }
  }//end or()

  /**
   * Not supported by this filter.
   * @throws UnsupportedOperationException always
   */
  @Override
  public void xor(Filter filter) {
    throw new UnsupportedOperationException("xor() is undefined for "
        + this.getClass().getName());
  }//end xor()

  /** @return the counts of all <code>vectorSize</code> buckets, separated by single spaces */
  @Override
  public String toString() {
    StringBuilder res = new StringBuilder();
    for (int i = 0; i < vectorSize; i++) {
      if (i > 0) {
        res.append(" ");
      }
      res.append(getBucket(i));
    }
    return res.toString();
  }//end toString()

  /**
   * @return a new CountingBloomFilter with the same parameters and a copy
   * of this filter's bucket array
   */
  @Override
  public Object clone() {
    CountingBloomFilter cbf = new CountingBloomFilter(vectorSize, nbHash, hashType);
    cbf.buckets = this.buckets.clone();
    return cbf;
  }

  // Writable

  /**
   * Writes the parent's fields followed by every bucket word as a long.
   * @param out destination
   * @throws IOException if writing fails
   */
  @Override
  public void write(DataOutput out) throws IOException {
    super.write(out);
    int sizeInWords = buckets2words(vectorSize);
    for (int i = 0; i < sizeInWords; i++) {
      out.writeLong(buckets[i]);
    }
  }

  /**
   * Reads the parent's fields and then the bucket words written by
   * {@link #write(DataOutput)}.
   * NOTE(review): relies on super.readFields having restored vectorSize -
   * confirm in Filter.
   * @param in source
   * @throws IOException if reading fails
   */
  @Override
  public void readFields(DataInput in) throws IOException {
    super.readFields(in);
    int sizeInWords = buckets2words(vectorSize);
    buckets = new long[sizeInWords];
    for (int i = 0; i < sizeInWords; i++) {
      buckets[i] = in.readLong();
    }
  }
}

View File

@ -1,301 +0,0 @@
/**
*
* Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org)
* All rights reserved.
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
* - Neither the name of the University Catholique de Louvain - UCL
* nor the names of its contributors may be used to endorse or
* promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.onelab.filter;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
* Implements a <i>dynamic Bloom filter</i>, as defined in the INFOCOM 2006 paper.
* <p>
* A dynamic Bloom filter (DBF) makes use of a <code>s * m</code> bit matrix but
* each of the <code>s</code> rows is a standard Bloom filter. The creation
* process of a DBF is iterative. At the start, the DBF is a <code>1 * m</code>
* bit matrix, i.e., it is composed of a single standard Bloom filter.
* It assumes that <code>n<sub>r</sub></code> elements are recorded in the
* initial bit vector, where <code>n<sub>r</sub> <= n</code> (<code>n</code> is
* the cardinality of the set <code>A</code> to record in the filter).
* <p>
* As the size of <code>A</code> grows during the execution of the application,
* several keys must be inserted in the DBF. When inserting a key into the DBF,
* one must first get an active Bloom filter in the matrix. A Bloom filter is
* active when the number of recorded keys, <code>n<sub>r</sub></code>, is
* strictly less than the current cardinality of <code>A</code>, <code>n</code>.
* If an active Bloom filter is found, the key is inserted and
* <code>n<sub>r</sub></code> is incremented by one. On the other hand, if there
* is no active Bloom filter, a new one is created (i.e., a new row is added to
* the matrix) according to the current size of <code>A</code> and the element
* is added in this new Bloom filter and the <code>n<sub>r</sub></code> value of
* this new Bloom filter is set to one. A given key is said to belong to the
* DBF if the <code>k</code> positions are set to one in one of the matrix rows.
*
* contract <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>.
*
* @version 1.0 - 6 Feb. 07
*
* @see org.onelab.filter.Filter The general behavior of a filter
* @see org.onelab.filter.BloomFilter A Bloom filter
*
* @see <a href="http://www.cse.fau.edu/~jie/research/publications/Publication_files/infocom2006.pdf">Theory and Network Applications of Dynamic Bloom Filters</a>
*/
public class DynamicBloomFilter extends Filter {
  /**
   * Threshold for the maximum number of keys to record in a dynamic Bloom filter row.
   */
  private int nr;

  /**
   * The number of keys recorded in the current (last) standard Bloom filter row.
   */
  private int currentNbRecord;

  /**
   * The matrix of Bloom filters; each element is one row.
   */
  private BloomFilter[] matrix;

  /**
   * Zero-args constructor for the serialization.
   * <p>
   * Leaves every field at its default value; {@link #readFields(DataInput)}
   * fills them in.
   */
  public DynamicBloomFilter() { }

  /**
   * Constructor.
   * <p>
   * Builds an empty Dynamic Bloom filter made of a single empty row.
   * @param vectorSize The number of bits in the vector.
   * @param nbHash The number of hash function to consider.
   * @param hashType type of the hashing function (see {@link Hash}).
   * @param nr The threshold for the maximum number of keys to record in a dynamic Bloom filter row.
   */
  public DynamicBloomFilter(int vectorSize, int nbHash, int hashType, int nr) {
    super(vectorSize, nbHash, hashType);
    this.nr = nr;
    this.currentNbRecord = 0;
    this.matrix = new BloomFilter[] {
      new BloomFilter(this.vectorSize, this.nbHash, this.hashType)
    };
  }

  /**
   * Records a key in the active row, appending a fresh row first when the
   * active one already holds <code>nr</code> keys.
   * @param key The key to add.
   * @throws NullPointerException if <code>key</code> is <code>null</code>.
   */
  @Override
  public void add(Key key) {
    if (key == null) {
      throw new NullPointerException("Key can not be null");
    }
    BloomFilter bf = getActiveStandardBF();
    if (bf == null) {
      // The last row is full: open a new one and restart the per-row counter.
      addRow();
      bf = matrix[matrix.length - 1];
      currentNbRecord = 0;
    }
    bf.add(key);
    currentNbRecord++;
  }

  /**
   * ANDs every row of <i>this</i> filter with the matching row of the given filter.
   * @param filter The filter to AND with.
   * @throws IllegalArgumentException if <code>filter</code> is not a
   * <code>DynamicBloomFilter</code> with the same vector size, hash count,
   * row count and row threshold.
   */
  @Override
  public void and(Filter filter) {
    DynamicBloomFilter dbf = requireCompatible(filter, "and-ed");
    for (int i = 0; i < matrix.length; i++) {
      matrix[i].and(dbf.matrix[i]);
    }
  }

  /**
   * Tests the key against every row.
   * @param key The key to test.
   * @return <code>true</code> if any row reports the key as a member, and also
   * when <code>key</code> is <code>null</code>; <code>false</code> otherwise.
   */
  @Override
  public boolean membershipTest(Key key) {
    if (key == null) {
      return true;
    }
    for (int i = 0; i < matrix.length; i++) {
      if (matrix[i].membershipTest(key)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Applies a logical NOT to every row of <i>this</i> filter.
   */
  @Override
  public void not() {
    for (int i = 0; i < matrix.length; i++) {
      matrix[i].not();
    }
  }

  /**
   * ORs every row of <i>this</i> filter with the matching row of the given filter.
   * @param filter The filter to OR with.
   * @throws IllegalArgumentException if <code>filter</code> is not a
   * <code>DynamicBloomFilter</code> with the same vector size, hash count,
   * row count and row threshold.
   */
  @Override
  public void or(Filter filter) {
    DynamicBloomFilter dbf = requireCompatible(filter, "or-ed");
    for (int i = 0; i < matrix.length; i++) {
      matrix[i].or(dbf.matrix[i]);
    }
  }

  /**
   * XORs every row of <i>this</i> filter with the matching row of the given filter.
   * @param filter The filter to XOR with.
   * @throws IllegalArgumentException if <code>filter</code> is not a
   * <code>DynamicBloomFilter</code> with the same vector size, hash count,
   * row count and row threshold.
   */
  @Override
  public void xor(Filter filter) {
    DynamicBloomFilter dbf = requireCompatible(filter, "xor-ed");
    for (int i = 0; i < matrix.length; i++) {
      matrix[i].xor(dbf.matrix[i]);
    }
  }

  /**
   * @return the string form of every row, each followed by the platform line separator.
   */
  @Override
  public String toString() {
    // Character.LINE_SEPARATOR is a Unicode general-category constant (a byte
    // equal to 13), not a line break: appending it prints the text "13".
    String lineSeparator = System.getProperty("line.separator");
    StringBuilder res = new StringBuilder();
    for (int i = 0; i < matrix.length; i++) {
      res.append(matrix[i]);
      res.append(lineSeparator);
    }
    return res.toString();
  }

  /**
   * @return a new <code>DynamicBloomFilter</code> with the same parameters and
   * record counter, whose rows are clones of the rows of <i>this</i> filter.
   */
  @Override
  public Object clone() {
    DynamicBloomFilter dbf = new DynamicBloomFilter(vectorSize, nbHash, hashType, nr);
    dbf.currentNbRecord = this.currentNbRecord;
    dbf.matrix = new BloomFilter[this.matrix.length];
    for (int i = 0; i < this.matrix.length; i++) {
      dbf.matrix[i] = (BloomFilter) this.matrix[i].clone();
    }
    return dbf;
  }

  // Writable

  /**
   * Writes the base filter fields, then <code>nr</code>, the record counter,
   * the row count and finally each row in order.
   */
  @Override
  public void write(DataOutput out) throws IOException {
    super.write(out);
    out.writeInt(nr);
    out.writeInt(currentNbRecord);
    out.writeInt(matrix.length);
    for (int i = 0; i < matrix.length; i++) {
      matrix[i].write(out);
    }
  }

  /**
   * Reads the fields in the order {@link #write(DataOutput)} emits them,
   * replacing the whole matrix.
   */
  @Override
  public void readFields(DataInput in) throws IOException {
    super.readFields(in);
    nr = in.readInt();
    currentNbRecord = in.readInt();
    int len = in.readInt();
    matrix = new BloomFilter[len];
    for (int i = 0; i < matrix.length; i++) {
      matrix[i] = new BloomFilter();
      matrix[i].readFields(in);
    }
  }

  /**
   * Checks that the given filter can be combined row by row with <i>this</i> one.
   * @param filter The candidate operand.
   * @param operation The suffix of the error message ("and-ed", "or-ed", "xor-ed").
   * @return <code>filter</code> cast to <code>DynamicBloomFilter</code>.
   * @throws IllegalArgumentException if the filter is <code>null</code>, of
   * another type, or differs in vector size, hash count, row count or threshold.
   */
  private DynamicBloomFilter requireCompatible(Filter filter, String operation) {
    String message = "filters cannot be " + operation;
    // instanceof is false for null, so no separate null test is needed.
    if (!(filter instanceof DynamicBloomFilter)
        || filter.vectorSize != this.vectorSize
        || filter.nbHash != this.nbHash) {
      throw new IllegalArgumentException(message);
    }
    DynamicBloomFilter dbf = (DynamicBloomFilter) filter;
    if (dbf.matrix.length != this.matrix.length || dbf.nr != this.nr) {
      throw new IllegalArgumentException(message);
    }
    return dbf;
  }

  /**
   * Adds a new, empty row to <i>this</i> dynamic Bloom filter.
   */
  private void addRow() {
    BloomFilter[] tmp = new BloomFilter[matrix.length + 1];
    // The old array is dropped right after, so the existing rows can be
    // carried over by reference instead of being cloned one by one.
    System.arraycopy(matrix, 0, tmp, 0, matrix.length);
    tmp[tmp.length - 1] = new BloomFilter(vectorSize, nbHash, hashType);
    matrix = tmp;
  }

  /**
   * Returns the active standard Bloom filter in <i>this</i> dynamic Bloom filter.
   * @return BloomFilter The last row while it holds fewer than <code>nr</code>
   * keys. <code>Null</code> otherwise.
   */
  private BloomFilter getActiveStandardBF() {
    if (currentNbRecord >= nr) {
      return null;
    }
    return matrix[matrix.length - 1];
  }
}

View File

@ -1,216 +0,0 @@
/**
*
* Copyright (c) 2005, European Commission project OneLab under contract 034819
* (http://www.one-lab.org)
*
* All rights reserved.
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
* - Neither the name of the University Catholique de Louvain - UCL
* nor the names of its contributors may be used to endorse or
* promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.onelab.filter;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import org.apache.hadoop.hbase.util.Hash;
import org.apache.hadoop.io.Writable;
/**
* Defines the general behavior of a filter.
* <p>
* A filter is a data structure which aims at offering a lossy summary of a set <code>A</code>. The
* key idea is to map entries of <code>A</code> (also called <i>keys</i>) into several positions
* in a vector through the use of several hash functions.
* <p>
* Typically, a filter will be implemented as a Bloom filter (or a Bloom filter extension).
* <p>
* It must be extended in order to define the real behavior.
*
* @see org.onelab.filter.Filter The general behavior of a filter
*
* @version 1.0 - 2 Feb. 07
*
* @see org.onelab.filter.Key The general behavior of a key
* @see org.onelab.filter.HashFunction A hash function
*/
public abstract class Filter implements Writable {
  /**
   * Marker written first by {@link #write(DataOutput)}. It is negative so that
   * {@link #readFields(DataInput)} can tell it apart from the old unversioned
   * format, whose first int was a positive hash count.
   */
  private static final int VERSION = -1;

  /** Size of the vector backing <i>this</i> filter. */
  protected int vectorSize;

  /** Hash function that maps a key to several positions in the vector. */
  protected HashFunction hash;

  /** Number of hash functions to consider. */
  protected int nbHash;

  /** Type of hashing function to use. */
  protected int hashType;

  /** Leaves every field at its default value. */
  protected Filter() {}

  /**
   * Stores the three parameters and builds the matching {@link HashFunction}.
   * @param vectorSize The vector size of <i>this</i> filter.
   * @param nbHash The number of hash functions to consider.
   * @param hashType type of the hashing function (see {@link Hash}).
   */
  protected Filter(int vectorSize, int nbHash, int hashType) {
    this.vectorSize = vectorSize;
    this.nbHash = nbHash;
    this.hashType = hashType;
    this.hash = new HashFunction(this.vectorSize, this.nbHash, this.hashType);
  }

  /**
   * Adds a key to <i>this</i> filter.
   * @param key The key to add.
   */
  public abstract void add(Key key);

  /**
   * Determines whether a specified key belongs to <i>this</i> filter.
   * @param key The key to test.
   * @return boolean True if the specified key belongs to <i>this</i> filter.
   * False otherwise.
   */
  public abstract boolean membershipTest(Key key);

  /**
   * Performs a logical AND between <i>this</i> filter and a specified filter.
   * <p>
   * <b>Invariant</b>: The result is assigned to <i>this</i> filter.
   * @param filter The filter to AND with.
   */
  public abstract void and(Filter filter);

  /**
   * Performs a logical OR between <i>this</i> filter and a specified filter.
   * <p>
   * <b>Invariant</b>: The result is assigned to <i>this</i> filter.
   * @param filter The filter to OR with.
   */
  public abstract void or(Filter filter);

  /**
   * Performs a logical XOR between <i>this</i> filter and a specified filter.
   * <p>
   * <b>Invariant</b>: The result is assigned to <i>this</i> filter.
   * @param filter The filter to XOR with.
   */
  public abstract void xor(Filter filter);

  /**
   * Performs a logical NOT on <i>this</i> filter.
   * <p>
   * The result is assigned to <i>this</i> filter.
   */
  public abstract void not();

  /**
   * Adds every key of a list to <i>this</i> filter, in iteration order.
   * @param keys The list of keys.
   * @throws IllegalArgumentException if <code>keys</code> is <code>null</code>.
   */
  public void add(List<Key> keys) {
    if (keys == null) {
      throw new IllegalArgumentException("ArrayList<Key> may not be null");
    }
    for (Key each : keys) {
      add(each);
    }
  }

  /**
   * Adds every key of a collection to <i>this</i> filter, in iteration order.
   * @param keys The collection of keys.
   * @throws IllegalArgumentException if <code>keys</code> is <code>null</code>.
   */
  public void add(Collection<Key> keys) {
    if (keys == null) {
      throw new IllegalArgumentException("Collection<Key> may not be null");
    }
    for (Key each : keys) {
      add(each);
    }
  }

  /**
   * Adds every key of an array to <i>this</i> filter, from first to last.
   * @param keys The array of keys.
   * @throws IllegalArgumentException if <code>keys</code> is <code>null</code>.
   */
  public void add(Key[] keys) {
    if (keys == null) {
      throw new IllegalArgumentException("Key[] may not be null");
    }
    for (Key each : keys) {
      add(each);
    }
  }

  // Writable interface

  /**
   * Writes the version marker, the hash count, the hash type (as one byte)
   * and the vector size, in that order.
   */
  public void write(DataOutput out) throws IOException {
    out.writeInt(VERSION);
    out.writeInt(nbHash);
    out.writeByte(hashType);
    out.writeInt(vectorSize);
  }

  /**
   * Reads either the versioned layout produced by {@link #write(DataOutput)}
   * or the old unversioned one, then rebuilds the hash function.
   * @throws IOException if the leading int is neither positive nor the
   * current version marker.
   */
  public void readFields(DataInput in) throws IOException {
    final int ver = in.readInt();
    final boolean oldFormat = ver > 0;
    if (!oldFormat && ver != VERSION) {
      throw new IOException("Unsupported version: " + ver);
    }
    if (oldFormat) {
      // Old unversioned format: the first int is the hash count itself.
      nbHash = ver;
      hashType = Hash.JENKINS_HASH;
    } else {
      nbHash = in.readInt();
      hashType = in.readByte();
    }
    vectorSize = in.readInt();
    hash = new HashFunction(vectorSize, nbHash, hashType);
  }
}

View File

@ -1,127 +0,0 @@
/**
*
* Copyright (c) 2005, European Commission project OneLab under contract 034819
* (http://www.one-lab.org)
*
* All rights reserved.
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
* - Neither the name of the University Catholique de Louvain - UCL
* nor the names of its contributors may be used to endorse or
* promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.onelab.filter;
import org.apache.hadoop.hbase.util.Hash;
/**
* Implements a hash object that returns a certain number of hashed values.
* <p>
* The hashed values are produced by the {@link Hash} implementation selected by the <code>hashType</code> constructor argument.
*
* @see org.onelab.filter.Filter The general behavior of a filter
*
* @version 1.0 - 2 Feb. 07
*
* @see org.onelab.filter.Key The general behavior of a key being stored in a filter
* @see org.onelab.filter.Filter The general behavior of a filter
*
* @see <a href="http://www.itl.nist.gov/fipspubs/fip180-1.htm">SHA-1 algorithm</a>
*/
public final class HashFunction {
  /** The number of hashed values. */
  private final int nbHash;

  /** Exclusive upper bound of every returned value. */
  private final int maxValue;

  /** Hashing algorithm to use. */
  private final Hash hashFunction;

  /**
   * Constructor.
   * <p>
   * Builds a hash function that returns <code>nbHash</code> values, each in
   * the range <code>[0, maxValue)</code>.
   * @param maxValue The exclusive upper bound of the returned values.
   * @param nbHash The number of resulting hashed values.
   * @param hashType type of the hashing function (see {@link Hash}).
   * @throws IllegalArgumentException if <code>maxValue</code> or
   * <code>nbHash</code> is not positive, or if <code>hashType</code> is unknown.
   */
  public HashFunction(int maxValue, int nbHash, int hashType) {
    if (maxValue <= 0) {
      throw new IllegalArgumentException("maxValue must be > 0");
    }
    if (nbHash <= 0) {
      throw new IllegalArgumentException("nbHash must be > 0");
    }
    this.maxValue = maxValue;
    this.nbHash = nbHash;
    this.hashFunction = Hash.getInstance(hashType);
    if (this.hashFunction == null) {
      throw new IllegalArgumentException("hashType must be known");
    }
  }

  /** Clears <i>this</i> hash function. A NOOP */
  public void clear() {
  }

  /**
   * Hashes a specified key into several integers.
   * <p>
   * Each round feeds the previous raw hash back in as the seed of the next one.
   * @param k The specified key.
   * @return The array of <code>nbHash</code> hashed values, each in
   * <code>[0, maxValue)</code>.
   * @throws NullPointerException if the key holds no byte array.
   * @throws IllegalArgumentException if the key's byte array is empty.
   */
  public int[] hash(Key k) {
    byte[] b = k.getBytes();
    if (b == null) {
      throw new NullPointerException("buffer reference is null");
    }
    if (b.length == 0) {
      throw new IllegalArgumentException("key length must be > 0");
    }
    int[] result = new int[nbHash];
    for (int i = 0, initval = 0; i < nbHash; i++) {
      initval = hashFunction.hash(b, initval);
      // Reduce first, then take the absolute value: Math.abs(Integer.MIN_VALUE)
      // is still negative and would yield a negative position. For every other
      // input this equals Math.abs(initval) % maxValue.
      result[i] = Math.abs(initval % maxValue);
    }
    return result;
  }
}

View File

@ -1,174 +0,0 @@
/**
*
* Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org)
* All rights reserved.
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
* - Neither the name of the University Catholique de Louvain - UCL
* nor the names of its contributors may be used to endorse or
* promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.onelab.filter;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
* The general behavior of a key that must be stored in a filter.
*
* @see org.onelab.filter.Filter The general behavior of a filter
*/
public class Key implements WritableComparable<Key> {
  /** Byte value of key */
  byte[] bytes;

  /**
   * The weight associated to <i>this</i> key.
   * <p>
   * <b>Invariant</b>: if it is not specified, each instance of
   * <code>Key</code> will have a default weight of 1.0
   */
  double weight;

  /** default constructor - use with readFields */
  public Key() {}

  /**
   * Constructor.
   * <p>
   * Builds a key with a default weight.
   * @param value The byte value of <i>this</i> key.
   * @throws IllegalArgumentException if <code>value</code> is <code>null</code>.
   */
  public Key(byte[] value) {
    this(value, 1.0);
  }

  /**
   * Constructor.
   * <p>
   * Builds a key with a specified weight.
   * @param value The value of <i>this</i> key.
   * @param weight The weight associated to <i>this</i> key.
   * @throws IllegalArgumentException if <code>value</code> is <code>null</code>.
   */
  public Key(byte[] value, double weight) {
    set(value, weight);
  }

  /**
   * Replaces the value and the weight of <i>this</i> key. The array is stored
   * as is, without being copied.
   * @param value The new byte value.
   * @param weight The new weight.
   * @throws IllegalArgumentException if <code>value</code> is <code>null</code>.
   */
  public void set(byte[] value, double weight) {
    if (value == null) {
      throw new IllegalArgumentException("value can not be null");
    }
    this.bytes = value;
    this.weight = weight;
  }

  /** @return byte[] The value of <i>this</i> key. */
  public byte[] getBytes() {
    return this.bytes;
  }

  /** @return Returns the weight associated to <i>this</i> key. */
  public double getWeight() {
    return weight;
  }

  /**
   * Increments the weight of <i>this</i> key with a specified value.
   * @param weight The increment.
   */
  public void incrementWeight(double weight) {
    this.weight += weight;
  }

  /** Increments the weight of <i>this</i> key by one. */
  public void incrementWeight() {
    this.weight++;
  }

  /**
   * @return <code>true</code> only when <code>o</code> is a <code>Key</code>
   * that {@link #compareTo(Key)} reports as equal; <code>false</code> for
   * <code>null</code> and for objects of any other type.
   */
  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    // Guard the cast: equals must answer false, not throw, for null or foreign types.
    if (!(o instanceof Key)) {
      return false;
    }
    return this.compareTo((Key) o) == 0;
  }

  /**
   * @return the XOR of every byte of the value and of the hash of the weight.
   */
  @Override
  public int hashCode() {
    int result = 0;
    for (int i = 0; i < bytes.length; i++) {
      result ^= bytes[i];
    }
    result ^= Double.valueOf(weight).hashCode();
    return result;
  }

  // Writable

  /** Writes the value length, the value bytes and the weight, in that order. */
  public void write(DataOutput out) throws IOException {
    out.writeInt(bytes.length);
    out.write(bytes);
    out.writeDouble(weight);
  }

  /** Reads the fields in the order {@link #write(DataOutput)} emits them. */
  public void readFields(DataInput in) throws IOException {
    this.bytes = new byte[in.readInt()];
    in.readFully(this.bytes);
    weight = in.readDouble();
  }

  // Comparable

  /**
   * Orders keys by value length, then byte by byte, then by weight.
   * @param o The key to compare with.
   * @return a negative, zero or positive int as <i>this</i> key is less than,
   * equal to or greater than <code>o</code>.
   */
  public int compareTo(Key o) {
    int result = this.bytes.length - o.getBytes().length;
    for (int i = 0; result == 0 && i < bytes.length; i++) {
      result = this.bytes[i] - o.bytes[i];
    }
    if (result == 0) {
      // Truncating the difference to int treated weights less than 1 apart as
      // equal, which disagreed with hashCode(); Double.compare does not.
      result = Double.compare(this.weight, o.weight);
    }
    return result;
  }
}

View File

@ -1,91 +0,0 @@
/**
*
* Copyright (c) 2005, European Commission project OneLab under contract 034819
* (http://www.one-lab.org)
*
* All rights reserved.
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
* - Neither the name of the University Catholique de Louvain - UCL
* nor the names of its contributors may be used to endorse or
* promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.onelab.filter;
/**
* Defines the different remove scheme for retouched Bloom filters.
*
* Developed under contract <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>.
*
* @version 1.0 - 7 Feb. 07
*/
public interface RemoveScheme {
  /**
   * Random selection: the bit to reset is picked at random.
   */
  short RANDOM = 0;

  /**
   * MinimumFN selection: the bit to reset is the one that will generate the
   * minimum number of false negatives.
   */
  short MINIMUM_FN = 1;

  /**
   * MaximumFP selection: the bit to reset is the one that will remove the
   * maximum number of false positives.
   */
  short MAXIMUM_FP = 2;

  /**
   * Ratio selection: the bit to reset is the one that removes the maximum
   * number of false positives while, at the same time, minimizing the number
   * of false negatives generated.
   */
  short RATIO = 3;
}

View File

@ -1,448 +0,0 @@
/**
*
* Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org)
* All rights reserved.
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
* - Neither the name of the University Catholique de Louvain - UCL
* nor the names of its contributors may be used to endorse or
* promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.onelab.filter;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Random;
/**
* Implements a <i>retouched Bloom filter</i>, as defined in the CoNEXT 2006 paper.
* <p>
* It allows the removal of selected false positives at the cost of introducing
* random false negatives, and with the benefit of eliminating some random false
* positives at the same time.
*
* Developed under contract <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>.
*
* @version 1.0 - 7 Feb. 07
*
* @see org.onelab.filter.Filter The general behavior of a filter
* @see org.onelab.filter.BloomFilter A Bloom filter
* @see org.onelab.filter.RemoveScheme The different selective clearing algorithms
*
* @see <a href="http://www-rp.lip6.fr/site_npa/site_rp/_publications/740-rbf_cameraready.pdf">Retouched Bloom Filters: Allowing Networked Applications to Trade Off Selected False Positives Against False Negatives</a>
*/
public final class RetouchedBloomFilter extends BloomFilter
implements RemoveScheme {
/**
* KeyList vector (or ElementList Vector, as defined in the paper) of false positives.
*/
List<Key>[] fpVector;
/**
* KeyList vector of keys recorded in the filter.
*/
List<Key>[] keyVector;
/**
* Ratio vector.
*/
double[] ratio;
private Random rand;
/**
 * Default constructor - use with readFields.
 * <p>
 * The body is empty: this constructor does not set up any of the fields
 * declared in this class.
 */
public RetouchedBloomFilter() {}
/**
 * Builds a retouched Bloom filter with the given dimensions.
 * @param vectorSize The vector size of <i>this</i> filter.
 * @param nbHash The number of hash function to consider.
 * @param hashType type of the hashing function (see {@link Hash}).
 */
public RetouchedBloomFilter(int vectorSize, int nbHash, int hashType) {
  super(vectorSize, nbHash, hashType);
  // No random generator yet.
  rand = null;
  createVector();
}
/**
 * Sets the bit of every hashed position of the key and records the key in
 * the key list of each of those positions.
 * @param key The key to add.
 * @throws NullPointerException if <code>key</code> is <code>null</code>.
 */
@Override
public void add(Key key) {
  if (key == null) {
    throw new NullPointerException("key can not be null");
  }
  final int[] positions = hash.hash(key);
  hash.clear();
  int i = 0;
  while (i < nbHash) {
    final int position = positions[i++];
    bits.set(position);
    keyVector[position].add(key);
  }
}
/**
 * Adds a false positive information to <i>this</i> retouched Bloom filter.
 * <p>
 * The key is appended to the false-positive list of each of its hashed
 * positions; the bit vector itself is not modified.
 * @param key The false positive key to add.
 * @throws NullPointerException if <code>key</code> is <code>null</code>.
 */
public void addFalsePositive(Key key) {
  if (key == null) {
    throw new NullPointerException("key can not be null");
  }
  final int[] positions = hash.hash(key);
  hash.clear();
  int i = 0;
  while (i < nbHash) {
    fpVector[positions[i++]].add(key);
  }
}
/**
 * Registers every key of a collection as a false positive, in iteration order.
 * @param coll The collection of false positive.
 * @throws NullPointerException if <code>coll</code> is <code>null</code>.
 */
public void addFalsePositive(Collection<Key> coll) {
  if (coll == null) {
    throw new NullPointerException("Collection<Key> can not be null");
  }
  for (Key falsePositive : coll) {
    addFalsePositive(falsePositive);
  }
}
/**
 * Registers every key of a list as a false positive, in iteration order.
 * @param keys The list of false positive.
 * @throws NullPointerException if <code>keys</code> is <code>null</code>.
 */
public void addFalsePositive(List<Key> keys) {
  if (keys == null) {
    throw new NullPointerException("ArrayList<Key> can not be null");
  }
  for (Key falsePositive : keys) {
    addFalsePositive(falsePositive);
  }
}
/**
 * Registers every key of an array as a false positive, from first to last.
 * @param keys The array of false positive.
 * @throws NullPointerException if <code>keys</code> is <code>null</code>.
 */
public void addFalsePositive(Key[] keys) {
  if (keys == null) {
    throw new NullPointerException("Key[] can not be null");
  }
  for (Key falsePositive : keys) {
    addFalsePositive(falsePositive);
  }
}
/**
 * Performs the selective clearing for a given key.
 * <p>
 * One of the key's hashed bit positions is chosen according to the scheme
 * and that bit is cleared.
 * @param k The false positive key to remove from <i>this</i> retouched Bloom filter.
 * @param scheme The selective clearing scheme to apply.
 * @throws NullPointerException if <code>k</code> is <code>null</code>.
 * @throws IllegalArgumentException if <code>k</code> does not currently test as a member.
 * @throws AssertionError if <code>scheme</code> is not one of the {@link RemoveScheme} constants.
 */
public void selectiveClearing(Key k, short scheme) {
  if (k == null) {
    throw new NullPointerException("Key can not be null");
  }
  if (!membershipTest(k)) {
    throw new IllegalArgumentException("Key is not a member");
  }
  final int[] h = hash.hash(k);
  final int index;
  switch (scheme) {
  case RANDOM:
    // randomRemove() returns a slot in [0, nbHash), i.e. an index into h, not
    // a bit position. Clearing that raw value would reset an unrelated bit
    // and leave the key a member, so map it to the hashed position first.
    index = h[randomRemove()];
    break;
  case MINIMUM_FN:
    index = minimumFnRemove(h);
    break;
  case MAXIMUM_FP:
    index = maximumFpRemove(h);
    break;
  case RATIO:
    index = ratioRemove(h);
    break;
  default:
    throw new AssertionError("Undefined selective clearing scheme");
  }
  clearBit(index);
}
/**
 * Draws a random integer, creating the random generator on first use.
 * @return int A value between 0 (inclusive) and <code>nbHash</code> (exclusive).
 */
private int randomRemove() {
  if (rand != null) {
    return rand.nextInt(nbHash);
  }
  rand = new Random();
  return rand.nextInt(nbHash);
}
/**
 * Picks, among the given bit positions, the one whose key list has the
 * lowest weight; on a tie the earliest position in <code>h</code> wins.
 * @param h The different bit positions.
 * @return int The position that minimizes the number of false negative generated.
 */
private int minimumFnRemove(int[] h) {
  int bestPosition = Integer.MAX_VALUE;
  double lowestWeight = Double.MAX_VALUE;
  for (int i = 0; i < nbHash; i++) {
    final int position = h[i];
    final double weight = getWeight(keyVector[position]);
    if (weight < lowestWeight) {
      lowestWeight = weight;
      bestPosition = position;
    }
  }
  return bestPosition;
}
/**
 * Chooses the bit position that maximizes the number of false positive removed.
 * <p>
 * Only the first <code>nbHash</code> entries of <code>h</code> are considered. The
 * selected position is the one whose false positive list has the largest total
 * weight; the earliest candidate is kept when several share that weight.
 * @param h The different bit positions.
 * @return int The position that maximizes the number of false positive removed.
 */
private int maximumFpRemove(int[] h){
  int maxIndex = Integer.MIN_VALUE;
  // Start below every possible weight. Double.MIN_VALUE is the smallest
  // POSITIVE double, so seeding the search with it rejected every candidate
  // whose weight is 0.0 (for instance when no false positive is recorded at
  // any of the positions) and left maxIndex at Integer.MIN_VALUE, which is
  // not a valid bit position.
  double maxValue = Double.NEGATIVE_INFINITY;
  for(int i = 0; i < nbHash; i++) {
    double fpWeight = getWeight(fpVector[h[i]]);
    if(fpWeight > maxValue) {
      maxValue = fpWeight;
      maxIndex = h[i];
    }
  }
  return maxIndex;
}//end maximumFpRemove()
/**
 * Chooses the bit position that minimizes the number of false negative generated
 * while maximizing the number of false positive removed.
 * <p>
 * The ratios are first refreshed with <code>computeRatio()</code>; the position
 * with the smallest ratio among the first <code>nbHash</code> entries of
 * <code>h</code> is then returned, the earliest candidate winning ties.
 * @param h The different bit positions.
 * @return int The position that minimizes the number of false negative generated
 * while maximizing the number of false positive removed.
 */
private int ratioRemove(int[] h){
  computeRatio();
  int bestPosition = Integer.MAX_VALUE;
  double smallestRatio = Double.MAX_VALUE;
  for(int slot = 0; slot < nbHash; slot++) {
    final int position = h[slot];
    final double candidate = ratio[position];
    if(!(candidate < smallestRatio)) {
      // not strictly smaller than the best ratio so far
      continue;
    }
    smallestRatio = candidate;
    bestPosition = position;
  }//end for - slot
  return bestPosition;
}//end ratioRemove()
/**
 * Clears a specified bit in the bit vector and keeps up-to-date the KeyList vectors.
 * <p>
 * Every key and every false positive recorded at <code>index</code> is removed
 * from all the positions it hashes to, both lists at <code>index</code> are
 * emptied, the ratio at <code>index</code> is reset to 0.0 and the bit itself
 * is cleared.
 * @param index The position of the bit to clear.
 * @throws ArrayIndexOutOfBoundsException If <code>index</code> is negative or not
 * smaller than <code>vectorSize</code>.
 */
private void clearBit(int index){
  if(!(index >= 0 && index < vectorSize)) {
    throw new ArrayIndexOutOfBoundsException(index);
  }
  final List<Key> members = keyVector[index];
  final List<Key> falsePositives = fpVector[index];

  // update key list: always take the head, at most as many times as the list
  // held entries on entry
  int pending = members.size();
  while(pending > 0 && !members.isEmpty()) {
    removeKey(members.get(0), keyVector);
    pending--;
  }
  // members is the very list stored at keyVector[index], one clear() empties it
  members.clear();

  //update false positive list, same procedure
  pending = falsePositives.size();
  while(pending > 0 && !falsePositives.isEmpty()) {
    removeKey(falsePositives.get(0), fpVector);
    pending--;
  }
  // falsePositives is the very list stored at fpVector[index]
  falsePositives.clear();

  //update ratio
  ratio[index] = 0.0;
  //update bit vector
  bits.clear(index);
}//end clearBit()
/**
 * Removes a given key from <i>this</i> filter.
 * <p>
 * The key is hashed and removed from the list found at each of its first
 * <code>nbHash</code> positions in <code>vector</code>.
 * @param k The key to remove.
 * @param vector The counting vector associated to the key.
 * @throws NullPointerException If <code>k</code> or <code>vector</code> is <code>null</code>.
 */
private void removeKey(Key k, List<Key>[] vector) {
  if(null == k) {
    throw new NullPointerException("Key can not be null");
  }
  if(null == vector) {
    throw new NullPointerException("ArrayList<Key>[] can not be null");
  }
  final int[] positions = hash.hash(k);
  hash.clear();
  for(int slot = 0; slot < nbHash; slot++) {
    final List<Key> bucket = vector[positions[slot]];
    bucket.remove(k);
  }
}//end removeKey()
/**
 * Computes the ratio A/FP.
 * <p>
 * For each position, the ratio is the total weight of the key list divided by
 * the total weight of the false positive list. A position is only updated when
 * both weights are strictly positive; otherwise its previous ratio is kept.
 */
private void computeRatio() {
  for(int position = 0; position < vectorSize; position++) {
    final double keyWeight = getWeight(keyVector[position]);
    final double fpWeight = getWeight(fpVector[position]);
    if(!(keyWeight > 0 && fpWeight > 0)) {
      // nothing to divide: leave ratio[position] untouched
      continue;
    }
    ratio[position] = keyWeight/fpWeight;
  }//end for - position
}//end computeRatio()
/**
 * Sums the weights of all the keys of a list.
 * @param keyList The keys whose weights are added up.
 * @return double The sum of <code>getWeight()</code> over the list, 0.0 for an empty list.
 */
private double getWeight(List<Key> keyList) {
  double total = 0.0;
  final java.util.Iterator<Key> cursor = keyList.iterator();
  while(cursor.hasNext()) {
    total += cursor.next().getWeight();
  }
  return total;
}
/**
 * Creates and initialises the various vectors.
 * <p>
 * Allocates <code>vectorSize</code> entries for the false positive lists, the key
 * lists and the ratios. Each list starts out as an empty synchronized
 * <code>ArrayList</code> and each ratio as 0.0.
 */
@SuppressWarnings("unchecked")
private void createVector() {
  fpVector = new List[vectorSize];
  keyVector = new List[vectorSize];
  ratio = new double[vectorSize];
  int position = 0;
  while(position < vectorSize) {
    fpVector[position] = Collections.synchronizedList(new ArrayList<Key>());
    keyVector[position] = Collections.synchronizedList(new ArrayList<Key>());
    ratio[position] = 0.0;
    position++;
  }//end while - position
}//end createVector()
// Writable
/**
 * Serializes <i>this</i> filter.
 * <p>
 * The superclass state is written first. Then, for every position, the false
 * positive list is written as its size followed by its keys; the key lists
 * follow in the same format, and finally one double per ratio.
 * @param out The stream to write to.
 * @throws IOException If writing to <code>out</code> fails.
 */
@Override
public void write(DataOutput out) throws IOException {
  super.write(out);
  // false positive lists
  for(List<Key> falsePositives: fpVector) {
    out.writeInt(falsePositives.size());
    for(Key falsePositive: falsePositives) {
      falsePositive.write(out);
    }
  }
  // key lists
  for(List<Key> members: keyVector) {
    out.writeInt(members.size());
    for(Key member: members) {
      member.write(out);
    }
  }
  // ratios
  for(double value: ratio) {
    out.writeDouble(value);
  }
}
/**
 * Restores <i>this</i> filter from a stream.
 * <p>
 * The superclass state is read first and the vectors are re-created. The
 * false positive lists, the key lists and the ratios are then read back in
 * that order, each list being stored as its size followed by its keys.
 * @param in The stream to read from.
 * @throws IOException If reading from <code>in</code> fails.
 */
@Override
public void readFields(DataInput in) throws IOException {
  super.readFields(in);
  createVector();
  // false positive lists
  for(List<Key> falsePositives: fpVector) {
    for(int remaining = in.readInt(); remaining > 0; remaining--) {
      final Key falsePositive = new Key();
      falsePositive.readFields(in);
      falsePositives.add(falsePositive);
    }
  }
  // key lists
  for(List<Key> members: keyVector) {
    for(int remaining = in.readInt(); remaining > 0; remaining--) {
      final Key member = new Key();
      member.readFields(in);
      members.add(member);
    }
  }
  // ratios
  for(int position = 0; position < ratio.length; position++) {
    ratio[position] = in.readDouble();
  }
}
}//end class

View File

@ -1,94 +0,0 @@
/**
* Copyright (c) 2005, European Commission project OneLab under contract 034819
* (http://www.one-lab.org)
*
* All rights reserved.
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
* - Neither the name of the University Catholique de Louvain - UCL
* nor the names of its contributors may be used to endorse or
* promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.onelab.test;
import java.io.UnsupportedEncodingException;
import org.apache.hadoop.hbase.HConstants;
import org.onelab.filter.Key;
/**
 * Test class for keys.
 * <p>
 * It gives an example on how to extend Key: a string is turned into bytes
 * with the charset named by <code>HConstants.UTF8_ENCODING</code> and handed to
 * the matching Key constructor.
 *
 * contract <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>.
 *
 * @version 1.0 - 5 Feb. 07
 *
 * @see org.onelab.filter.Key A key stored in a filter
 */
public class StringKey extends Key {

  /** Default constructor - use with readFields */
  public StringKey() {
    super();
  }

  /**
   * Builds a key from a string, leaving the weight to the Key constructor
   * that takes only the bytes.
   *
   * @param key String key value
   * @throws UnsupportedEncodingException if the charset named by
   * <code>HConstants.UTF8_ENCODING</code> is not supported
   */
  public StringKey(String key)
  throws UnsupportedEncodingException {
    super(key.getBytes(HConstants.UTF8_ENCODING));
  }

  /**
   * Builds a key from a string and an explicit weight.
   *
   * @param key String key value
   * @param weight key weight
   * @throws UnsupportedEncodingException if the charset named by
   * <code>HConstants.UTF8_ENCODING</code> is not supported
   */
  public StringKey(String key, double weight)
  throws UnsupportedEncodingException {
    super(key.getBytes(HConstants.UTF8_ENCODING), weight);
  }
}

View File

@ -1,323 +0,0 @@
/**
* Copyright (c) 2005, European Commission project OneLab under contract 034819
* (http://www.one-lab.org)
*
* All rights reserved.
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
* - Neither the name of the University Catholique de Louvain - UCL
* nor the names of its contributors may be used to endorse or
* promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.onelab.test;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import junit.framework.TestCase;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.util.Hash;
import org.onelab.filter.*;
/**
 * Test class.
 * <p>
 * Exercises BloomFilter (including a serialization round trip),
 * CountingBloomFilter and DynamicBloomFilter with StringKey keys.
 *
 * contract <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>.
 *
 * @version 1.0 - 8 Feb. 07
 */
public class TestFilter extends TestCase {
  private static final Log LOG = LogFactory.getLog(TestFilter.class);

  /** Number of hash functions used to size and build the filter in testBloomFilter. */
  private static final int DEFAULT_NUMBER_OF_HASH_FUNCTIONS = 4;

  /** Test a BloomFilter
   * @throws UnsupportedEncodingException
   * @throws IOException
   */
  public void testBloomFilter() throws UnsupportedEncodingException,
  IOException {
    final StringKey[] inserted = {
        new StringKey("wmjwjzyv"),
        new StringKey("baietibz"),
        new StringKey("guhsgxnv"),
        new StringKey("mhnqycto"),
        new StringKey("xcyqafgz"),
        new StringKey("zidoamgb"),
        new StringKey("tftfirzd"),
        new StringKey("okapqlrg"),
        new StringKey("yccwzwsq"),
        new StringKey("qmonufqu"),
        new StringKey("wlsctews"),
        new StringKey("mksdhqri"),
        new StringKey("wxxllokj"),
        new StringKey("eviuqpls"),
        new StringKey("bavotqmj"),
        new StringKey("yibqzhdl"),
        new StringKey("csfqmsyr"),
        new StringKey("guxliyuh"),
        new StringKey("pzicietj"),
        new StringKey("qdwgrqwo"),
        new StringKey("ujfzecmi"),
        new StringKey("dzeqfvfi"),
        new StringKey("phoegsij"),
        new StringKey("bvudfcou"),
        new StringKey("dowzmciz"),
        new StringKey("etvhkizp"),
        new StringKey("rzurqycg"),
        new StringKey("krqfxuge"),
        new StringKey("gflcohtd"),
        new StringKey("fcrcxtps"),
        new StringKey("qrtovxdq"),
        new StringKey("aypxwrwi"),
        new StringKey("dckpyznr"),
        new StringKey("mdaawnpz"),
        new StringKey("pakdfvca"),
        new StringKey("xjglfbez"),
        new StringKey("xdsecofi"),
        new StringKey("sjlrfcab"),
        new StringKey("ebcjawxv"),
        new StringKey("hkafkjmy"),
        new StringKey("oimmwaxo"),
        new StringKey("qcuzrazo"),
        new StringKey("nqydfkwk"),
        new StringKey("frybvmlb"),
        new StringKey("amxmaqws"),
        new StringKey("gtkovkgx"),
        new StringKey("vgwxrwss"),
        new StringKey("xrhzmcep"),
        new StringKey("tafwziil"),
        new StringKey("erjmncnv"),
        new StringKey("heyzqzrn"),
        new StringKey("sowvyhtu"),
        new StringKey("heeixgzy"),
        new StringKey("ktcahcob"),
        new StringKey("ljhbybgg"),
        new StringKey("jiqfcksl"),
        new StringKey("anjdkjhm"),
        new StringKey("uzcgcuxp"),
        new StringKey("vzdhjqla"),
        new StringKey("svhgwwzq"),
        new StringKey("zhswvhbp"),
        new StringKey("ueceybwy"),
        new StringKey("czkqykcw"),
        new StringKey("ctisayir"),
        new StringKey("hppbgciu"),
        new StringKey("nhzgljfk"),
        new StringKey("vaziqllf"),
        new StringKey("narvrrij"),
        new StringKey("kcevbbqi"),
        new StringKey("qymuaqnp"),
        new StringKey("pwqpfhsr"),
        new StringKey("peyeicuk"),
        new StringKey("kudlwihi"),
        new StringKey("pkmqejlm"),
        new StringKey("ylwzjftl"),
        new StringKey("rhqrlqar"),
        new StringKey("xmftvzsp"),
        new StringKey("iaemtihk"),
        new StringKey("ymsbrqcu"),
        new StringKey("yfnlcxto"),
        new StringKey("nluqopqh"),
        new StringKey("wmrzhtox"),
        new StringKey("qnffhqbl"),
        new StringKey("zypqpnbw"),
        new StringKey("oiokhatd"),
        new StringKey("mdraddiu"),
        new StringKey("zqoatltt"),
        new StringKey("ewhulbtm"),
        new StringKey("nmswpsdf"),
        new StringKey("xsjeteqe"),
        new StringKey("ufubcbma"),
        new StringKey("phyxvrds"),
        new StringKey("vhnfldap"),
        new StringKey("zrrlycmg"),
        new StringKey("becotcjx"),
        new StringKey("wvbubokn"),
        new StringKey("avkgiopr"),
        new StringKey("mbqqxmrv"),
        new StringKey("ibplgvuu"),
        new StringKey("dghvpkgc")
    };
    final StringKey[] notInserted = {
        new StringKey("abcdefgh"),
        new StringKey("ijklmnop"),
        new StringKey("qrstuvwx"),
        new StringKey("yzabcdef")
    };
    /*
     * Bloom filters are very sensitive to the number of elements inserted into
     * them.
     *
     * If m denotes the number of bits in the Bloom filter (vectorSize),
     * n denotes the number of elements inserted into the Bloom filter and
     * k represents the number of hash functions used (nbHash), then
     * according to Broder and Mitzenmacher,
     *
     * ( http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/BloomFilterSurvey.pdf )
     *
     * the probability of false positives is minimized when k is
     * approximately ln(2) * m/n.
     *
     * If we fix the number of hash functions and know the number of entries,
     * then the optimal vector size m = (k * n) / ln(2)
     */
    BloomFilter bf = new BloomFilter(
        (int) Math.ceil(
            (DEFAULT_NUMBER_OF_HASH_FUNCTIONS * (1.0 * inserted.length)) /
            Math.log(2.0)),
        DEFAULT_NUMBER_OF_HASH_FUNCTIONS,
        Hash.JENKINS_HASH
    );
    for (StringKey key : inserted) {
      bf.add(key);
    }
    // Verify that there are no false negatives and few (if any) false positives
    checkFalsePositivesNegatives(bf, inserted, notInserted);
    // Test serialization/deserialization
    LOG.info("Checking serialization/deserialization");
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(baos);
    bf.write(out);
    ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
    DataInputStream in = new DataInputStream(bais);
    bf = new BloomFilter();
    bf.readFields(in);
    // Verify that there are no false negatives and few (if any) false positives
    checkFalsePositivesNegatives(bf, inserted, notInserted);
  }

  /**
   * Fails the running test if any inserted key is reported absent or any
   * non-inserted key is reported present.
   *
   * @param bf filter under test
   * @param inserted keys that were added to the filter
   * @param notInserted keys that were never added to the filter
   */
  private void checkFalsePositivesNegatives(BloomFilter bf,
      StringKey[] inserted, StringKey[] notInserted) {
    // Test membership for values we inserted. Should not get false negatives
    LOG.info("Checking for false negatives");
    for (StringKey key : inserted) {
      if (!bf.membershipTest(key)) {
        final String message = "false negative for: " + key;
        LOG.error(message);
        // carry the offending key in the failure itself, not only in the log
        fail(message);
      }
    }
    // Test membership for values we did not insert. It is possible to get
    // false positives
    LOG.info("Checking for false positives");
    for (StringKey key : notInserted) {
      if (bf.membershipTest(key)) {
        final String message = "false positive for: " + key;
        LOG.error(message);
        fail(message);
      }
    }
    LOG.info("Success!");
  }

  /** Test a CountingBloomFilter
   * @throws UnsupportedEncodingException
   */
  public void testCountingBloomFilter() throws UnsupportedEncodingException {
    Filter bf = new CountingBloomFilter(128, 2, Hash.JENKINS_HASH);
    Key key = new StringKey("toto");
    Key k2 = new StringKey("lulu");
    Key k3 = new StringKey("mama");
    bf.add(key);
    bf.add(k2);
    bf.add(k3);
    assertTrue(bf.membershipTest(key));
    assertFalse(bf.membershipTest(new StringKey("xyzzy")));
    assertFalse(bf.membershipTest(new StringKey("abcd")));
    // delete 'key', and check that it is no longer a member
    ((CountingBloomFilter)bf).delete(key);
    assertFalse(bf.membershipTest(key));
    // to test for overflows, add 'key' enough times to overflow a 4bit bucket,
    // while asserting that it stays a member
    for(int i = 0; i < 16; i++){
      bf.add(key);
      assertTrue(bf.membershipTest(key));
    }
    // test approximateCount
    CountingBloomFilter bf3 = new CountingBloomFilter(4, 2, Hash.JENKINS_HASH);
    // test the exact range
    // (assertEquals takes the expected value first, so that a failure reports
    // "expected" and "but was" the right way round)
    for (int i = 0; i < 8; i++) {
      bf3.add(key);
      bf3.add(k2);
      assertEquals(i + 1, bf3.approximateCount(key));
      assertEquals(i + 1, bf3.approximateCount(k2));
    }
    // test gently degraded counting in high-fill, high error rate filter
    for (int i = 8; i < 15; i++) {
      bf3.add(key);
      assertTrue(bf3.approximateCount(key) >= (i + 1));
      assertEquals(8, bf3.approximateCount(k2));
      assertEquals(0, bf3.approximateCount(k3));
    }
  }

  /** Test a DynamicBloomFilter
   * @throws UnsupportedEncodingException
   */
  public void testDynamicBloomFilter() throws UnsupportedEncodingException {
    Filter bf = new DynamicBloomFilter(128, 2, Hash.JENKINS_HASH, 2);
    Key key = new StringKey("toto");
    Key k2 = new StringKey("lulu");
    Key k3 = new StringKey("mama");
    bf.add(key);
    bf.add(k2);
    bf.add(k3);
    assertTrue(bf.membershipTest(key));
    assertFalse(bf.membershipTest(new StringKey("xyzzy")));
    assertFalse(bf.membershipTest(new StringKey("abcd")));
  }
}//end class