HBASE-1381 Remove onelab and bloom filters files from hbase

git-svn-id: https://svn.apache.org/repos/asf/hadoop/hbase/trunk@772429 13f79535-47bb-0310-9956-ffa450edef68

parent e78e38dce8
commit 2597f74d10

CHANGES.txt
@@ -17,6 +17,7 @@ Release 0.20.0 - Unreleased
 HBASE-1342 Add to filesystem info needed to rebuild .META.
 HBASE-1361 Disable bloom filters
 HBASE-1367 Get rid of Thrift exception 'NotFound'
+HBASE-1381 Remove onelab and bloom filters files from hbase

 BUG FIXES
 HBASE-1140 "ant clean test" fails (Nitay Joffe via Stack)

org/apache/hadoop/hbase/io/BloomFilterMapFile.java (deleted)
@@ -1,260 +0,0 @@
/**
 * Copyright 2008 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.io;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HStoreKey;
import org.apache.hadoop.hbase.util.Hash;
import org.apache.hadoop.hbase.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.onelab.filter.BloomFilter;
import org.onelab.filter.Key;

/**
 * On write, all keys are added to a bloom filter.  On read, all keys are
 * tested first against bloom filter.  Keys are HStoreKey.  If passed bloom
 * filter is null, just passes invocation to parent.
 */
// TODO should be fixed generic warnings from MapFile methods
@SuppressWarnings("unchecked")
public class BloomFilterMapFile extends HBaseMapFile {
  static final Log LOG = LogFactory.getLog(BloomFilterMapFile.class);
  protected static final String BLOOMFILTER_FILE_NAME = "filter";

  public static class Reader extends HBaseReader {
    private final BloomFilter bloomFilter;

    /**
     * @param fs
     * @param dirName
     * @param conf
     * @param filter
     * @param blockCacheEnabled
     * @param hri
     * @throws IOException
     */
    public Reader(FileSystem fs, String dirName, Configuration conf,
        final boolean filter, final boolean blockCacheEnabled,
        HRegionInfo hri)
    throws IOException {
      super(fs, dirName, conf, blockCacheEnabled, hri);
      if (filter) {
        this.bloomFilter = loadBloomFilter(fs, dirName);
      } else {
        this.bloomFilter = null;
      }
    }

    private BloomFilter loadBloomFilter(FileSystem fs, String dirName)
    throws IOException {
      Path filterFile = new Path(dirName, BLOOMFILTER_FILE_NAME);
      if(!fs.exists(filterFile)) {
        LOG.warn("FileNotFound: " + filterFile + "; proceeding without");
        return null;
      }
      BloomFilter filter = new BloomFilter();
      FSDataInputStream in = fs.open(filterFile);
      try {
        filter.readFields(in);
      } finally {
        in.close();
      }
      return filter;
    }

    /**
     * @see org.apache.hadoop.hbase.io.MapFile.Reader#get(org.apache.hadoop.io.WritableComparable, org.apache.hadoop.io.Writable)
     */
    @Override
    public Writable get(WritableComparable key, Writable val)
    throws IOException {
      if (bloomFilter == null) {
        return super.get(key, val);
      }
      if(bloomFilter.membershipTest(getBloomFilterKey(key))) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("bloom filter reported that key exists");
        }
        return super.get(key, val);
      }
      if (LOG.isDebugEnabled()) {
        LOG.debug("bloom filter reported that key does not exist");
      }
      return null;
    }

    /**
     * @see org.apache.hadoop.hbase.io.MapFile.Reader#getClosest(org.apache.hadoop.io.WritableComparable, org.apache.hadoop.io.Writable)
     */
    @Override
    public WritableComparable getClosest(WritableComparable key,
        Writable val) throws IOException {
      if (bloomFilter == null) {
        return super.getClosest(key, val);
      }
      // Note - the key being passed to us is always a HStoreKey
      if(bloomFilter.membershipTest(getBloomFilterKey(key))) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("bloom filter reported that key exists");
        }
        return super.getClosest(key, val);
      }
      if (LOG.isDebugEnabled()) {
        LOG.debug("bloom filter reported that key does not exist");
      }
      return null;
    }

    /**
     * @return size of the bloom filter
     */
    public int getBloomFilterSize() {
      return bloomFilter == null ? 0 : bloomFilter.getVectorSize();
    }
  }

  public static class Writer extends HBaseWriter {
    private static final double DEFAULT_NUMBER_OF_HASH_FUNCTIONS = 4.0;
    private final BloomFilter bloomFilter;
    private final String dirName;
    private final FileSystem fs;

    /**
     * @param conf
     * @param fs
     * @param dirName
     * @param compression
     * @param filter
     * @param nrows
     * @param hri
     * @throws IOException
     */
    public Writer(Configuration conf, FileSystem fs, String dirName,
        SequenceFile.CompressionType compression, final boolean filter,
        int nrows, final HRegionInfo hri)
    throws IOException {
      super(conf, fs, dirName, compression, hri);
      this.dirName = dirName;
      this.fs = fs;
      if (filter) {
        /*
         * There is no way to automatically determine the vector size and the
         * number of hash functions to use. In particular, bloom filters are
         * very sensitive to the number of elements inserted into them. For
         * HBase, the number of entries depends on the size of the data stored
         * in the column. Currently the default region size is 256MB, so the
         * number of entries is approximately
         * 256MB / (average value size for column).
         *
         * If m denotes the number of bits in the Bloom filter (vectorSize),
         * n denotes the number of elements inserted into the Bloom filter and
         * k represents the number of hash functions used (nbHash), then
         * according to Broder and Mitzenmacher,
         *
         * ( http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/BloomFilterSurvey.pdf )
         *
         * the probability of false positives is minimized when k is
         * approximately m/n ln(2).
         *
         * If we fix the number of hash functions and know the number of
         * entries, then the optimal vector size m = (k * n) / ln(2)
         */
        BloomFilter f = null;
        try {
          f = new BloomFilter(
            (int) Math.ceil(
                (DEFAULT_NUMBER_OF_HASH_FUNCTIONS * (1.0 * nrows)) /
                Math.log(2.0)),
            (int) DEFAULT_NUMBER_OF_HASH_FUNCTIONS,
            Hash.getHashType(conf)
          );
        } catch (IllegalArgumentException e) {
          LOG.warn("Failed creating bloomfilter; proceeding without", e);
        }
        this.bloomFilter = f;
      } else {
        this.bloomFilter = null;
      }
    }

    /**
     * @see org.apache.hadoop.hbase.io.MapFile.Writer#append(org.apache.hadoop.io.WritableComparable, org.apache.hadoop.io.Writable)
     */
    @Override
    public void append(WritableComparable key, Writable val)
    throws IOException {
      if (bloomFilter != null) {
        bloomFilter.add(getBloomFilterKey(key));
      }
      super.append(key, val);
    }

    /**
     * @see org.apache.hadoop.hbase.io.MapFile.Writer#close()
     */
    @Override
    public synchronized void close() throws IOException {
      super.close();
      if (this.bloomFilter != null) {
        flushBloomFilter();
      }
    }

    /**
     * Flushes bloom filter to disk
     *
     * @throws IOException
     */
    private void flushBloomFilter() throws IOException {
      if (LOG.isDebugEnabled()) {
        LOG.debug("flushing bloom filter for " + this.dirName);
      }
      FSDataOutputStream out =
        fs.create(new Path(dirName, BLOOMFILTER_FILE_NAME));
      try {
        bloomFilter.write(out);
      } finally {
        out.close();
      }
      if (LOG.isDebugEnabled()) {
        LOG.debug("flushed bloom filter for " + this.dirName);
      }
    }
  }

  /**
   * Custom bloom filter key maker.
   * @param key
   * @return Key made of bytes of row only.
   */
  protected static Key getBloomFilterKey(WritableComparable key) {
    return new Key(((HStoreKey) key).getRow());
  }
}
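The sizing comment in BloomFilterMapFile.Writer above reduces to m = ceil(k * n / ln 2) with k fixed at 4. A minimal standalone sketch of that arithmetic follows; the row count of 1,000,000 is an illustrative assumption, not a value taken from HBase.

public class BloomSizingSketch {
  public static void main(String[] args) {
    final double k = 4.0;      // hash functions, mirrors DEFAULT_NUMBER_OF_HASH_FUNCTIONS above
    final int n = 1000000;     // assumed number of entries in the store file (illustrative only)
    // Optimal bit-vector size when k is fixed: m = k * n / ln(2)
    int m = (int) Math.ceil((k * n) / Math.log(2.0));
    // Expected false-positive probability: (1 - e^(-k*n/m))^k
    double p = Math.pow(1.0 - Math.exp(-k * n / (double) m), k);
    System.out.println("vectorSize m = " + m);             // ~5,770,781 bits
    System.out.println("false positive rate ~= " + p);     // ~0.0625, i.e. 2^-4, for k = 4
  }
}
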
org/apache/hadoop/hbase/io/HalfMapFileReader.java
@@ -44,7 +44,7 @@ import org.apache.hadoop.io.WritableComparable;
  * <p>This file is not splitable.  Calls to {@link #midKey()} return null.
  */
 //TODO should be fixed generic warnings from MapFile methods
-public class HalfMapFileReader extends BloomFilterMapFile.Reader {
+public class HalfMapFileReader extends HBaseMapFile.HBaseReader {
   private final boolean top;
   private final HStoreKey midkey;
   private boolean firstNextCall = true;
@@ -63,7 +63,7 @@ public class HalfMapFileReader extends BloomFilterMapFile.Reader {
       final WritableComparable<HStoreKey> mk,
       final HRegionInfo hri)
   throws IOException {
-    this(fs, dirName, conf, r, mk, false, false, hri);
+    this(fs, dirName, conf, r, mk, false, hri);
   }

   /**
@@ -72,18 +72,17 @@ public class HalfMapFileReader extends BloomFilterMapFile.Reader {
    * @param conf
    * @param r
    * @param mk
-   * @param filter
    * @param blockCacheEnabled
    * @param hri
    * @throws IOException
    */
   public HalfMapFileReader(final FileSystem fs, final String dirName,
       final Configuration conf, final Range r,
-      final WritableComparable<HStoreKey> mk, final boolean filter,
+      final WritableComparable<HStoreKey> mk,
       final boolean blockCacheEnabled,
       final HRegionInfo hri)
   throws IOException {
-    super(fs, dirName, conf, filter, blockCacheEnabled, hri);
+    super(fs, dirName, conf, blockCacheEnabled, hri);
     // This is not actual midkey for this half-file; its just border
     // around which we split top and bottom.  Have to look in files to find
     // actual last and first keys for bottom and top halves.  Half-files don't
org/apache/hadoop/hbase/util/Hash.java
@@ -24,9 +24,6 @@ import org.apache.hadoop.conf.Configuration;
  * This class represents a common API for hashing functions.
  */
 public abstract class Hash {
-  // TODO: Fix the design tangle that has classes over in org.onelab.filter
-  // referring to this class.  Would need to also move the Jenkins and Murmur
-  // hashing function too.
   /** Constant to denote invalid hash type. */
   public static final int INVALID_HASH = -1;
   /** Constant to denote {@link JenkinsHash}. */
org/onelab/filter/BloomFilter.java (deleted)
@@ -1,238 +0,0 @@
/**
 *
 * Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org)
 * All rights reserved.
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the distribution.
 *  - Neither the name of the University Catholique de Louvain - UCL
 *    nor the names of its contributors may be used to endorse or
 *    promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.onelab.filter;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.BitSet;

/**
 * Implements a <i>Bloom filter</i>, as defined by Bloom in 1970.
 * <p>
 * The Bloom filter is a data structure that was introduced in 1970 and that has been adopted by
 * the networking research community in the past decade thanks to the bandwidth efficiencies that it
 * offers for the transmission of set membership information between networked hosts.  A sender encodes
 * the information into a bit vector, the Bloom filter, that is more compact than a conventional
 * representation. Computation and space costs for construction are linear in the number of elements.
 * The receiver uses the filter to test whether various elements are members of the set. Though the
 * filter will occasionally return a false positive, it will never return a false negative. When creating
 * the filter, the sender can choose its desired point in a trade-off between the false positive rate and the size.
 *
 * contract <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>.
 *
 * @version 1.0 - 2 Feb. 07
 *
 * @see org.onelab.filter.Filter The general behavior of a filter
 *
 * @see <a href="http://portal.acm.org/citation.cfm?id=362692&dl=ACM&coll=portal">Space/Time Trade-Offs in Hash Coding with Allowable Errors</a>
 */
public class BloomFilter extends Filter {
  private static final byte[] bitvalues = new byte[] {
    (byte)0x01,
    (byte)0x02,
    (byte)0x04,
    (byte)0x08,
    (byte)0x10,
    (byte)0x20,
    (byte)0x40,
    (byte)0x80
  };

  /** The bit vector. */
  BitSet bits;

  /** Default constructor - use with readFields */
  public BloomFilter() {
    super();
  }

  /**
   * Constructor
   * @param vectorSize The vector size of <i>this</i> filter.
   * @param nbHash The number of hash functions to consider.
   * @param hashType type of the hashing function (see {@link Hash}).
   */
  public BloomFilter(int vectorSize, int nbHash, int hashType){
    super(vectorSize, nbHash, hashType);

    bits = new BitSet(this.vectorSize);
  }//end constructor

  @Override
  public void add(Key key) {
    if(key == null) {
      throw new NullPointerException("key cannot be null");
    }

    int[] h = hash.hash(key);
    hash.clear();

    for(int i = 0; i < nbHash; i++) {
      bits.set(h[i]);
    }
  }//end add()

  @Override
  public void and(Filter filter){
    if(filter == null
        || !(filter instanceof BloomFilter)
        || filter.vectorSize != this.vectorSize
        || filter.nbHash != this.nbHash) {
      throw new IllegalArgumentException("filters cannot be and-ed");
    }

    this.bits.and(((BloomFilter) filter).bits);
  }//end and()

  @Override
  public boolean membershipTest(Key key){
    if(key == null) {
      throw new NullPointerException("key cannot be null");
    }

    int[] h = hash.hash(key);
    hash.clear();
    for(int i = 0; i < nbHash; i++) {
      if(!bits.get(h[i])) {
        return false;
      }
    }
    return true;
  }//end membershipTest()

  @Override
  public void not(){
    bits.flip(0, vectorSize - 1);
  }//end not()

  @Override
  public void or(Filter filter){
    if(filter == null
        || !(filter instanceof BloomFilter)
        || filter.vectorSize != this.vectorSize
        || filter.nbHash != this.nbHash) {
      throw new IllegalArgumentException("filters cannot be or-ed");
    }
    bits.or(((BloomFilter) filter).bits);
  }//end or()

  @Override
  public void xor(Filter filter){
    if(filter == null
        || !(filter instanceof BloomFilter)
        || filter.vectorSize != this.vectorSize
        || filter.nbHash != this.nbHash) {
      throw new IllegalArgumentException("filters cannot be xor-ed");
    }
    bits.xor(((BloomFilter) filter).bits);
  }//end xor()

  @Override
  public String toString(){
    return bits.toString();
  }//end toString()

  @Override
  public Object clone(){
    BloomFilter bf = new BloomFilter(vectorSize, nbHash, hashType);
    bf.or(this);
    return bf;
  }//end clone()

  /**
   * @return size of the bloomfilter
   */
  public int getVectorSize() {
    return this.vectorSize;
  }

  // Writable

  @Override
  public void write(DataOutput out) throws IOException {
    super.write(out);
    byte[] bytes = new byte[getNBytes()];
    for(int i = 0, byteIndex = 0, bitIndex = 0; i < vectorSize; i++, bitIndex++) {
      if (bitIndex == 8) {
        bitIndex = 0;
        byteIndex++;
      }
      if (bitIndex == 0) {
        bytes[byteIndex] = 0;
      }
      if (bits.get(i)) {
        bytes[byteIndex] |= bitvalues[bitIndex];
      }
    }
    out.write(bytes);
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    super.readFields(in);
    bits = new BitSet(this.vectorSize);
    byte[] bytes = new byte[getNBytes()];
    in.readFully(bytes);
    for(int i = 0, byteIndex = 0, bitIndex = 0; i < vectorSize; i++, bitIndex++) {
      if (bitIndex == 8) {
        bitIndex = 0;
        byteIndex++;
      }
      if ((bytes[byteIndex] & bitvalues[bitIndex]) != 0) {
        bits.set(i);
      }
    }
  }

  /* @return number of bytes needed to hold bit vector */
  private int getNBytes() {
    return (vectorSize + 7) / 8;
  }
}//end class
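The add()/membershipTest() pair above is the core of the class: set nbHash positions in a BitSet, and later report "maybe present" only if every one of those positions is set. A minimal self-contained sketch of that logic follows; the toy position generator stands in for org.onelab.filter.HashFunction (an assumption made for brevity, not the real Jenkins/Murmur code).

import java.util.BitSet;

public class BitSetBloomSketch {
  private final BitSet bits;
  private final int vectorSize;
  private final int nbHash;

  BitSetBloomSketch(int vectorSize, int nbHash) {
    this.vectorSize = vectorSize;
    this.nbHash = nbHash;
    this.bits = new BitSet(vectorSize);
  }

  // Toy stand-in for HashFunction.hash(): chain a seed across nbHash rounds,
  // reducing each round modulo vectorSize as the original does.
  private int[] positions(byte[] key) {
    int[] h = new int[nbHash];
    int seed = 0;
    for (int i = 0; i < nbHash; i++) {
      for (byte b : key) {
        seed = 31 * seed + b;
      }
      h[i] = Math.abs(seed) % vectorSize;
    }
    return h;
  }

  void add(byte[] key) {
    for (int pos : positions(key)) {
      bits.set(pos);                  // same idea as bits.set(h[i]) in add() above
    }
  }

  boolean membershipTest(byte[] key) {
    for (int pos : positions(key)) {
      if (!bits.get(pos)) {
        return false;                 // a clear bit proves the key was never added
      }
    }
    return true;                      // may be a false positive, never a false negative
  }
}
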
org/onelab/filter/CountingBloomFilter.java (deleted)
@@ -1,309 +0,0 @@
/**
 *
 * Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org)
 * All rights reserved.
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the distribution.
 *  - Neither the name of the University Catholique de Louvain - UCL
 *    nor the names of its contributors may be used to endorse or
 *    promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.onelab.filter;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Implements a <i>counting Bloom filter</i>, as defined by Fan et al. in a ToN
 * 2000 paper.
 * <p>
 * A counting Bloom filter is an improvement to a standard Bloom filter as it
 * allows dynamic additions and deletions of set membership information.  This
 * is achieved through the use of a counting vector instead of a bit vector.
 *
 * contract <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>.
 *
 * @version 1.1 - 19 Jan. 08
 *
 * @see org.onelab.filter.Filter The general behavior of a filter
 *
 * @see <a href="http://portal.acm.org/citation.cfm?id=343571.343572">Summary cache: a scalable wide-area web cache sharing protocol</a>
 */
public final class CountingBloomFilter extends Filter {
  /** Storage for the counting buckets */
  private long[] buckets;

  /** We are using 4bit buckets, so each bucket can count to 15 */
  private final static long BUCKET_MAX_VALUE = 15;

  /** Default constructor - use with readFields */
  public CountingBloomFilter() {}

  /**
   * Constructor
   * @param vectorSize The vector size of <i>this</i> filter.
   * @param nbHash The number of hash functions to consider.
   * @param hashType type of the hashing function (see {@link Hash}).
   */
  public CountingBloomFilter(int vectorSize, int nbHash, int hashType){
    super(vectorSize, nbHash, hashType);
    buckets = new long[buckets2words(vectorSize)];
  }//end constructor

  /** returns the number of 64 bit words it would take to hold vectorSize buckets */
  private static int buckets2words(int vectorSize) {
    return ((vectorSize - 1) >>> 4) + 1;
  }

  @Override
  public void add(Key key) {
    if(key == null) {
      throw new NullPointerException("key can not be null");
    }

    int[] h = hash.hash(key);
    hash.clear();

    for(int i = 0; i < nbHash; i++) {
      // find the bucket
      int wordNum = h[i] >> 4;          // div 16
      int bucketShift = (h[i] & 0x0f) << 2;  // (mod 16) * 4

      long bucketMask = 15L << bucketShift;
      long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;

      // only increment if the count in the bucket is less than BUCKET_MAX_VALUE
      if(bucketValue < BUCKET_MAX_VALUE) {
        // increment by 1
        buckets[wordNum] = (buckets[wordNum] & ~bucketMask) | ((bucketValue + 1) << bucketShift);
      }
    }
  }//end add()

  /**
   * Removes a specified key from <i>this</i> counting Bloom filter.
   * <p>
   * <b>Invariant</b>: nothing happens if the specified key does not belong to <i>this</i> counter Bloom filter.
   * @param key The key to remove.
   */
  public void delete(Key key) {
    if(key == null) {
      throw new NullPointerException("Key may not be null");
    }
    if(!membershipTest(key)) {
      throw new IllegalArgumentException("Key is not a member");
    }

    int[] h = hash.hash(key);
    hash.clear();

    for(int i = 0; i < nbHash; i++) {
      // find the bucket
      int wordNum = h[i] >> 4;          // div 16
      int bucketShift = (h[i] & 0x0f) << 2;  // (mod 16) * 4

      long bucketMask = 15L << bucketShift;
      long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;

      // only decrement if the count in the bucket is between 0 and BUCKET_MAX_VALUE
      if(bucketValue >= 1 && bucketValue < BUCKET_MAX_VALUE) {
        // decrement by 1
        buckets[wordNum] = (buckets[wordNum] & ~bucketMask) | ((bucketValue - 1) << bucketShift);
      }
    }
  }//end delete

  @Override
  public void and(Filter filter){
    if(filter == null
        || !(filter instanceof CountingBloomFilter)
        || filter.vectorSize != this.vectorSize
        || filter.nbHash != this.nbHash) {
      throw new IllegalArgumentException("filters cannot be and-ed");
    }
    CountingBloomFilter cbf = (CountingBloomFilter)filter;

    int sizeInWords = buckets2words(vectorSize);
    for(int i = 0; i < sizeInWords; i++) {
      this.buckets[i] &= cbf.buckets[i];
    }
  }//end and()

  @Override
  public boolean membershipTest(Key key){
    if(key == null) {
      throw new NullPointerException("Key may not be null");
    }

    int[] h = hash.hash(key);
    hash.clear();

    for(int i = 0; i < nbHash; i++) {
      // find the bucket
      int wordNum = h[i] >> 4;          // div 16
      int bucketShift = (h[i] & 0x0f) << 2;  // (mod 16) * 4

      long bucketMask = 15L << bucketShift;

      if((buckets[wordNum] & bucketMask) == 0) {
        return false;
      }
    }

    return true;
  }//end membershipTest()

  /**
   * This method calculates an approximate count of the key, i.e. how many
   * times the key was added to the filter. This allows the filter to be
   * used as an approximate <code>key -&gt; count</code> map.
   * <p>NOTE: due to the bucket size of this filter, inserting the same
   * key more than 15 times will cause an overflow at all filter positions
   * associated with this key, and it will significantly increase the error
   * rate for this and other keys. For this reason the filter can only be
   * used to store small count values <code>0 &lt;= N &lt;&lt; 15</code>.
   * @param key key to be tested
   * @return 0 if the key is not present. Otherwise, a positive value v will
   * be returned such that <code>v == count</code> with probability equal to the
   * error rate of this filter, and <code>v &gt; count</code> otherwise.
   * Additionally, if the filter experienced an underflow as a result of
   * {@link #delete(Key)} operation, the return value may be lower than the
   * <code>count</code> with the probability of the false negative rate of such
   * filter.
   */
  public int approximateCount(Key key) {
    int res = Integer.MAX_VALUE;
    int[] h = hash.hash(key);
    hash.clear();
    for (int i = 0; i < nbHash; i++) {
      // find the bucket
      int wordNum = h[i] >> 4;          // div 16
      int bucketShift = (h[i] & 0x0f) << 2;  // (mod 16) * 4

      long bucketMask = 15L << bucketShift;
      long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;
      if (bucketValue < res) res = (int)bucketValue;
    }
    if (res != Integer.MAX_VALUE) {
      return res;
    }
    return 0;
  }

  @Override
  public void not(){
    throw new UnsupportedOperationException("not() is undefined for "
        + this.getClass().getName());
  }//end not()

  @Override
  public void or(Filter filter){
    if(filter == null
        || !(filter instanceof CountingBloomFilter)
        || filter.vectorSize != this.vectorSize
        || filter.nbHash != this.nbHash) {
      throw new IllegalArgumentException("filters cannot be or-ed");
    }

    CountingBloomFilter cbf = (CountingBloomFilter)filter;

    int sizeInWords = buckets2words(vectorSize);
    for(int i = 0; i < sizeInWords; i++) {
      this.buckets[i] |= cbf.buckets[i];
    }
  }//end or()

  @Override
  public void xor(Filter filter){
    throw new UnsupportedOperationException("xor() is undefined for "
        + this.getClass().getName());
  }//end xor()

  @Override
  public String toString(){
    StringBuilder res = new StringBuilder();

    for(int i = 0; i < vectorSize; i++) {
      if(i > 0) {
        res.append(" ");
      }

      int wordNum = i >> 4;          // div 16
      int bucketShift = (i & 0x0f) << 2;  // (mod 16) * 4

      long bucketMask = 15L << bucketShift;
      long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;

      res.append(bucketValue);
    }

    return res.toString();
  }//end toString()

  @Override
  public Object clone(){
    CountingBloomFilter cbf = new CountingBloomFilter(vectorSize, nbHash, hashType);
    cbf.buckets = this.buckets.clone();
    return cbf;
  }

  // Writable

  @Override
  public void write(DataOutput out) throws IOException {
    super.write(out);
    int sizeInWords = buckets2words(vectorSize);
    for(int i = 0; i < sizeInWords; i++) {
      out.writeLong(buckets[i]);
    }
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    super.readFields(in);
    int sizeInWords = buckets2words(vectorSize);
    buckets = new long[sizeInWords];
    for(int i = 0; i < sizeInWords; i++) {
      buckets[i] = in.readLong();
    }
  }
}
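The wordNum/bucketShift arithmetic above packs sixteen 4-bit counters into each long: h >> 4 picks the word, (h & 0x0f) << 2 picks the bit offset inside it. A standalone sketch of one increment-and-read, with the hash position chosen arbitrarily for illustration:

public class FourBitBucketSketch {
  public static void main(String[] args) {
    long[] buckets = new long[1];          // one 64-bit word = sixteen 4-bit counters
    int h = 5;                             // assumed hash position, illustrative only

    int wordNum = h >> 4;                  // h / 16: which long holds the counter
    int bucketShift = (h & 0x0f) << 2;     // (h % 16) * 4: bit offset inside that long
    long bucketMask = 15L << bucketShift;

    // read, then increment, exactly as CountingBloomFilter.add() does
    long value = (buckets[wordNum] & bucketMask) >>> bucketShift;
    if (value < 15) {                      // BUCKET_MAX_VALUE: the counter saturates at 15
      buckets[wordNum] = (buckets[wordNum] & ~bucketMask) | ((value + 1) << bucketShift);
    }

    System.out.println((buckets[wordNum] & bucketMask) >>> bucketShift);  // prints 1
  }
}
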
org/onelab/filter/DynamicBloomFilter.java (deleted)
@@ -1,301 +0,0 @@
/**
 *
 * Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org)
 * All rights reserved.
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the distribution.
 *  - Neither the name of the University Catholique de Louvain - UCL
 *    nor the names of its contributors may be used to endorse or
 *    promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.onelab.filter;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Implements a <i>dynamic Bloom filter</i>, as defined in the INFOCOM 2006 paper.
 * <p>
 * A dynamic Bloom filter (DBF) makes use of a <code>s * m</code> bit matrix but
 * each of the <code>s</code> rows is a standard Bloom filter. The creation
 * process of a DBF is iterative. At the start, the DBF is a <code>1 * m</code>
 * bit matrix, i.e., it is composed of a single standard Bloom filter.
 * It assumes that <code>n<sub>r</sub></code> elements are recorded in the
 * initial bit vector, where <code>n<sub>r</sub> &lt;= n</code> (<code>n</code> is
 * the cardinality of the set <code>A</code> to record in the filter).
 * <p>
 * As the size of <code>A</code> grows during the execution of the application,
 * several keys must be inserted in the DBF. When inserting a key into the DBF,
 * one must first get an active Bloom filter in the matrix. A Bloom filter is
 * active when the number of recorded keys, <code>n<sub>r</sub></code>, is
 * strictly less than the current cardinality of <code>A</code>, <code>n</code>.
 * If an active Bloom filter is found, the key is inserted and
 * <code>n<sub>r</sub></code> is incremented by one. On the other hand, if there
 * is no active Bloom filter, a new one is created (i.e., a new row is added to
 * the matrix) according to the current size of <code>A</code> and the element
 * is added in this new Bloom filter and the <code>n<sub>r</sub></code> value of
 * this new Bloom filter is set to one. A given key is said to belong to the
 * DBF if the <code>k</code> positions are set to one in one of the matrix rows.
 *
 * contract <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>.
 *
 * @version 1.0 - 6 Feb. 07
 *
 * @see org.onelab.filter.Filter The general behavior of a filter
 * @see org.onelab.filter.BloomFilter A Bloom filter
 *
 * @see <a href="http://www.cse.fau.edu/~jie/research/publications/Publication_files/infocom2006.pdf">Theory and Network Applications of Dynamic Bloom Filters</a>
 */
public class DynamicBloomFilter extends Filter {
  /**
   * Threshold for the maximum number of keys to record in a dynamic Bloom filter row.
   */
  private int nr;

  /**
   * The number of keys recorded in the current standard active Bloom filter.
   */
  private int currentNbRecord;

  /**
   * The matrix of Bloom filters.
   */
  private BloomFilter[] matrix;

  /**
   * Zero-args constructor for the serialization.
   */
  public DynamicBloomFilter() { }

  /**
   * Constructor.
   * <p>
   * Builds an empty Dynamic Bloom filter.
   * @param vectorSize The number of bits in the vector.
   * @param nbHash The number of hash functions to consider.
   * @param hashType type of the hashing function (see {@link Hash}).
   * @param nr The threshold for the maximum number of keys to record in a dynamic Bloom filter row.
   */
  public DynamicBloomFilter(int vectorSize, int nbHash, int hashType, int nr) {
    super(vectorSize, nbHash, hashType);

    this.nr = nr;
    this.currentNbRecord = 0;

    matrix = new BloomFilter[1];
    matrix[0] = new BloomFilter(this.vectorSize, this.nbHash, this.hashType);
  }//end constructor

  @Override
  public void add(Key key){
    if(key == null) {
      throw new NullPointerException("Key can not be null");
    }

    BloomFilter bf = getActiveStandardBF();

    if(bf == null){
      addRow();
      bf = matrix[matrix.length - 1];
      currentNbRecord = 0;
    }

    bf.add(key);

    currentNbRecord++;
  }//end add()

  @Override
  public void and(Filter filter) {
    if(filter == null
        || !(filter instanceof DynamicBloomFilter)
        || filter.vectorSize != this.vectorSize
        || filter.nbHash != this.nbHash) {
      throw new IllegalArgumentException("filters cannot be and-ed");
    }

    DynamicBloomFilter dbf = (DynamicBloomFilter)filter;

    if(dbf.matrix.length != this.matrix.length || dbf.nr != this.nr) {
      throw new IllegalArgumentException("filters cannot be and-ed");
    }

    for(int i = 0; i < matrix.length; i++) {
      matrix[i].and(dbf.matrix[i]);
    }
  }//end and()

  @Override
  public boolean membershipTest(Key key){
    if(key == null) {
      return true;
    }

    for(int i = 0; i < matrix.length; i++) {
      if(matrix[i].membershipTest(key)) {
        return true;
      }
    }

    return false;
  }//end membershipTest()

  @Override
  public void not(){
    for(int i = 0; i < matrix.length; i++) {
      matrix[i].not();
    }
  }//end not()

  @Override
  public void or(Filter filter){
    if(filter == null
        || !(filter instanceof DynamicBloomFilter)
        || filter.vectorSize != this.vectorSize
        || filter.nbHash != this.nbHash) {
      throw new IllegalArgumentException("filters cannot be or-ed");
    }

    DynamicBloomFilter dbf = (DynamicBloomFilter)filter;

    if(dbf.matrix.length != this.matrix.length || dbf.nr != this.nr) {
      throw new IllegalArgumentException("filters cannot be or-ed");
    }
    for(int i = 0; i < matrix.length; i++) {
      matrix[i].or(dbf.matrix[i]);
    }
  }//end or()

  @Override
  public void xor(Filter filter){
    if(filter == null
        || !(filter instanceof DynamicBloomFilter)
        || filter.vectorSize != this.vectorSize
        || filter.nbHash != this.nbHash) {
      throw new IllegalArgumentException("filters cannot be xor-ed");
    }
    DynamicBloomFilter dbf = (DynamicBloomFilter)filter;

    if(dbf.matrix.length != this.matrix.length || dbf.nr != this.nr) {
      throw new IllegalArgumentException("filters cannot be xor-ed");
    }

    for(int i = 0; i < matrix.length; i++) {
      matrix[i].xor(dbf.matrix[i]);
    }
  }//end xor()

  @Override
  public String toString(){
    StringBuilder res = new StringBuilder();

    for(int i = 0; i < matrix.length; i++) {
      res.append(matrix[i]);
      res.append(Character.LINE_SEPARATOR);
    }
    return res.toString();
  }//end toString()

  @Override
  public Object clone(){
    DynamicBloomFilter dbf = new DynamicBloomFilter(vectorSize, nbHash, hashType, nr);
    dbf.currentNbRecord = this.currentNbRecord;
    dbf.matrix = new BloomFilter[this.matrix.length];
    for(int i = 0; i < this.matrix.length; i++) {
      dbf.matrix[i] = (BloomFilter)this.matrix[i].clone();
    }
    return dbf;
  }//end clone()

  // Writable

  @Override
  public void write(DataOutput out) throws IOException {
    super.write(out);
    out.writeInt(nr);
    out.writeInt(currentNbRecord);
    out.writeInt(matrix.length);
    for (int i = 0; i < matrix.length; i++) {
      matrix[i].write(out);
    }
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    super.readFields(in);
    nr = in.readInt();
    currentNbRecord = in.readInt();
    int len = in.readInt();
    matrix = new BloomFilter[len];
    for (int i = 0; i < matrix.length; i++) {
      matrix[i] = new BloomFilter();
      matrix[i].readFields(in);
    }
  }

  /**
   * Adds a new row to <i>this</i> dynamic Bloom filter.
   */
  private void addRow(){
    BloomFilter[] tmp = new BloomFilter[matrix.length + 1];

    for(int i = 0; i < matrix.length; i++) {
      tmp[i] = (BloomFilter)matrix[i].clone();
    }

    tmp[tmp.length - 1] = new BloomFilter(vectorSize, nbHash, hashType);

    matrix = tmp;
  }//end addRow()

  /**
   * Returns the active standard Bloom filter in <i>this</i> dynamic Bloom filter.
   * @return BloomFilter The active standard Bloom filter.
   * <code>Null</code> otherwise.
   */
  private BloomFilter getActiveStandardBF() {
    if(currentNbRecord >= nr) {
      return null;
    }

    return matrix[matrix.length - 1];
  }//end getActiveStandardBF()
}//end class
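The javadoc above describes growth by appending a fresh Bloom filter row once the active row holds nr keys. A short usage sketch of that behavior follows; it exercises the class exactly as declared above, so it only compiles against a tree that still contains org.onelab.filter (i.e. before this commit), and the sizes plus Key's byte[] constructor (not shown in this diff) are assumptions for illustration.

import org.apache.hadoop.hbase.util.Hash;
import org.onelab.filter.DynamicBloomFilter;
import org.onelab.filter.Key;

public class DynamicBloomFilterSketch {
  public static void main(String[] args) {
    // 1000 bits per row, 4 hash functions, at most nr = 100 keys per row
    DynamicBloomFilter dbf =
        new DynamicBloomFilter(1000, 4, Hash.JENKINS_HASH, 100);
    for (int i = 0; i < 250; i++) {
      // after every 100 adds the active row is full, so addRow() appends a new
      // BloomFilter to the matrix; 250 keys end up spread over 3 rows
      dbf.add(new Key(("row-" + i).getBytes()));
    }
    System.out.println(dbf.membershipTest(new Key("row-7".getBytes())));    // true
    System.out.println(dbf.membershipTest(new Key("missing".getBytes())));  // false, with high probability
  }
}
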
org/onelab/filter/Filter.java (deleted)
@@ -1,216 +0,0 @@
/**
 *
 * Copyright (c) 2005, European Commission project OneLab under contract 034819
 * (http://www.one-lab.org)
 *
 * All rights reserved.
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the distribution.
 *  - Neither the name of the University Catholique de Louvain - UCL
 *    nor the names of its contributors may be used to endorse or
 *    promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.onelab.filter;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Collection;
import java.util.List;

import org.apache.hadoop.hbase.util.Hash;
import org.apache.hadoop.io.Writable;

/**
 * Defines the general behavior of a filter.
 * <p>
 * A filter is a data structure which aims at offering a lossy summary of a set <code>A</code>.  The
 * key idea is to map entries of <code>A</code> (also called <i>keys</i>) into several positions
 * in a vector through the use of several hash functions.
 * <p>
 * Typically, a filter will be implemented as a Bloom filter (or a Bloom filter extension).
 * <p>
 * It must be extended in order to define the real behavior.
 *
 * @see org.onelab.filter.Filter The general behavior of a filter
 *
 * @version 1.0 - 2 Feb. 07
 *
 * @see org.onelab.filter.Key The general behavior of a key
 * @see org.onelab.filter.HashFunction A hash function
 */
public abstract class Filter implements Writable {
  private static final int VERSION = -1; // negative to accommodate for old format
  /** The vector size of <i>this</i> filter. */
  protected int vectorSize;

  /** The hash function used to map a key to several positions in the vector. */
  protected HashFunction hash;

  /** The number of hash functions to consider. */
  protected int nbHash;

  /** Type of hashing function to use. */
  protected int hashType;

  protected Filter() {}

  /**
   * Constructor.
   * @param vectorSize The vector size of <i>this</i> filter.
   * @param nbHash The number of hash functions to consider.
   * @param hashType type of the hashing function (see {@link Hash}).
   */
  protected Filter(int vectorSize, int nbHash, int hashType) {
    this.vectorSize = vectorSize;
    this.nbHash = nbHash;
    this.hashType = hashType;
    this.hash = new HashFunction(this.vectorSize, this.nbHash, this.hashType);
  }//end constructor

  /**
   * Adds a key to <i>this</i> filter.
   * @param key The key to add.
   */
  public abstract void add(Key key);

  /**
   * Determines whether a specified key belongs to <i>this</i> filter.
   * @param key The key to test.
   * @return boolean True if the specified key belongs to <i>this</i> filter.
   * False otherwise.
   */
  public abstract boolean membershipTest(Key key);

  /**
   * Performs a logical AND between <i>this</i> filter and a specified filter.
   * <p>
   * <b>Invariant</b>: The result is assigned to <i>this</i> filter.
   * @param filter The filter to AND with.
   */
  public abstract void and(Filter filter);

  /**
   * Performs a logical OR between <i>this</i> filter and a specified filter.
   * <p>
   * <b>Invariant</b>: The result is assigned to <i>this</i> filter.
   * @param filter The filter to OR with.
   */
  public abstract void or(Filter filter);

  /**
   * Performs a logical XOR between <i>this</i> filter and a specified filter.
   * <p>
   * <b>Invariant</b>: The result is assigned to <i>this</i> filter.
   * @param filter The filter to XOR with.
   */
  public abstract void xor(Filter filter);

  /**
   * Performs a logical NOT on <i>this</i> filter.
   * <p>
   * The result is assigned to <i>this</i> filter.
   */
  public abstract void not();

  /**
   * Adds a list of keys to <i>this</i> filter.
   * @param keys The list of keys.
   */
  public void add(List<Key> keys){
    if(keys == null) {
      throw new IllegalArgumentException("ArrayList<Key> may not be null");
    }

    for(Key key: keys) {
      add(key);
    }
  }//end add()

  /**
   * Adds a collection of keys to <i>this</i> filter.
   * @param keys The collection of keys.
   */
  public void add(Collection<Key> keys){
    if(keys == null) {
      throw new IllegalArgumentException("Collection<Key> may not be null");
    }
    for(Key key: keys) {
      add(key);
    }
  }//end add()

  /**
   * Adds an array of keys to <i>this</i> filter.
   * @param keys The array of keys.
   */
  public void add(Key[] keys){
    if(keys == null) {
      throw new IllegalArgumentException("Key[] may not be null");
    }
    for(int i = 0; i < keys.length; i++) {
      add(keys[i]);
    }
  }//end add()

  // Writable interface

  public void write(DataOutput out) throws IOException {
    out.writeInt(VERSION);
    out.writeInt(this.nbHash);
    out.writeByte(this.hashType);
    out.writeInt(this.vectorSize);
  }

  public void readFields(DataInput in) throws IOException {
    int ver = in.readInt();
    if (ver > 0) { // old unversioned format
      this.nbHash = ver;
      this.hashType = Hash.JENKINS_HASH;
    } else if (ver == VERSION) {
      this.nbHash = in.readInt();
      this.hashType = in.readByte();
    } else {
      throw new IOException("Unsupported version: " + ver);
    }
    this.vectorSize = in.readInt();
    this.hash = new HashFunction(this.vectorSize, this.nbHash, this.hashType);
  }
}//end class
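Filter.readFields() above tells the old unversioned layout (where the first int on the wire was nbHash itself, always positive) from the current one by writing VERSION = -1 first. A tiny standalone round-trip of just that header logic, using plain java.io streams; the hashType byte value written here is illustrative, not a constant taken from the Hash class.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class FilterHeaderSketch {
  public static void main(String[] args) throws IOException {
    ByteArrayOutputStream buf = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(buf);
    out.writeInt(-1);      // VERSION: negative, so it can never be mistaken for an old nbHash
    out.writeInt(4);       // nbHash
    out.writeByte(0);      // hashType (illustrative value)
    out.writeInt(1000);    // vectorSize

    DataInputStream in = new DataInputStream(new ByteArrayInputStream(buf.toByteArray()));
    int ver = in.readInt();
    if (ver > 0) {         // old unversioned format: the first int was nbHash itself
      System.out.println("old format, nbHash = " + ver);
    } else if (ver == -1) {
      System.out.println("nbHash = " + in.readInt() + ", hashType = " + in.readByte());
    }
    System.out.println("vectorSize = " + in.readInt());
  }
}
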
@ -1,127 +0,0 @@
|
|||
/**
|
||||
*
|
||||
* Copyright (c) 2005, European Commission project OneLab under contract 034819
|
||||
* (http://www.one-lab.org)
|
||||
*
|
||||
* All rights reserved.
|
||||
* Redistribution and use in source and binary forms, with or
|
||||
* without modification, are permitted provided that the following
|
||||
* conditions are met:
|
||||
* - Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the name of the University Catholique de Louvain - UCL
|
||||
* nor the names of its contributors may be used to endorse or
|
||||
* promote products derived from this software without specific prior
|
||||
* written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
||||
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
package org.onelab.filter;

import org.apache.hadoop.hbase.util.Hash;

/**
 * Implements a hash object that returns a certain number of hashed values.
 * <p>
 * It is based on the SHA-1 algorithm.
 *
 * @version 1.0 - 2 Feb. 07
 *
 * @see org.onelab.filter.Key The general behavior of a key being stored in a filter
 * @see org.onelab.filter.Filter The general behavior of a filter
 *
 * @see <a href="http://www.itl.nist.gov/fipspubs/fip180-1.htm">SHA-1 algorithm</a>
 */
public final class HashFunction {
  /** The number of hashed values. */
  private int nbHash;

  /** The maximum highest returned value. */
  private int maxValue;

  /** Hashing algorithm to use. */
  private Hash hashFunction;

  /**
   * Constructor.
   * <p>
   * Builds a hash function that returns a given number of hashed values, each
   * strictly smaller than a given maximum value.
   * @param maxValue The maximum highest returned value.
   * @param nbHash The number of resulting hashed values.
   * @param hashType type of the hashing function (see {@link Hash}).
   */
  public HashFunction(int maxValue, int nbHash, int hashType) {
    if (maxValue <= 0) {
      throw new IllegalArgumentException("maxValue must be > 0");
    }

    if (nbHash <= 0) {
      throw new IllegalArgumentException("nbHash must be > 0");
    }

    this.maxValue = maxValue;
    this.nbHash = nbHash;
    this.hashFunction = Hash.getInstance(hashType);
    if (this.hashFunction == null) {
      throw new IllegalArgumentException("hashType must be known");
    }
  }//end constructor

  /** Clears <i>this</i> hash function. A NOOP */
  public void clear() {
  }

  /**
   * Hashes a specified key into several integers.
   * @param k The specified key.
   * @return The array of hashed values.
   */
  public int[] hash(Key k) {
    byte[] b = k.getBytes();
    if (b == null) {
      throw new NullPointerException("buffer reference is null");
    }
    if (b.length == 0) {
      throw new IllegalArgumentException("key length must be > 0");
    }
    int[] result = new int[nbHash];
    for (int i = 0, initval = 0; i < nbHash; i++) {
      initval = hashFunction.hash(b, initval);
      result[i] = Math.abs(initval) % maxValue;
    }
    return result;
  }//end hash()

}//end class
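A hedged usage sketch of the class deleted above (not part of the original file): it assumes only the HashFunction, Key and Hash types that appear elsewhere in this change, and the sizes are invented for illustration.

// Illustrative only: map one key to nbHash bit positions inside a bit vector
// of size maxValue, the way the removed Bloom filter classes drive HashFunction.
import org.apache.hadoop.hbase.util.Hash;
import org.onelab.filter.HashFunction;
import org.onelab.filter.Key;

public class HashFunctionSketch {
  public static void main(String[] args) {
    int vectorSize = 1024;  // assumed size of the backing bit vector (maxValue)
    int nbHash = 4;         // assumed number of hash functions
    HashFunction h = new HashFunction(vectorSize, nbHash, Hash.JENKINS_HASH);
    int[] positions = h.hash(new Key("row-1234".getBytes()));
    // Each position is in [0, vectorSize); a Bloom filter sets these bits on
    // add() and probes them on membershipTest().
    for (int p : positions) {
      System.out.println(p);
    }
  }
}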
@@ -1,174 +0,0 @@
package org.onelab.filter;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

/**
 * The general behavior of a key that must be stored in a filter.
 *
 * @see org.onelab.filter.Filter The general behavior of a filter
 */
public class Key implements WritableComparable<Key> {
  /** Byte value of key */
  byte[] bytes;

  /**
   * The weight associated to <i>this</i> key.
   * <p>
   * <b>Invariant</b>: if it is not specified, each instance of
   * <code>Key</code> will have a default weight of 1.0
   */
  double weight;

  /** default constructor - use with readFields */
  public Key() {}

  /**
   * Constructor.
   * <p>
   * Builds a key with a default weight.
   * @param value The byte value of <i>this</i> key.
   */
  public Key(byte[] value) {
    this(value, 1.0);
  }//end constructor

  /**
   * Constructor.
   * <p>
   * Builds a key with a specified weight.
   * @param value The value of <i>this</i> key.
   * @param weight The weight associated to <i>this</i> key.
   */
  public Key(byte[] value, double weight) {
    set(value, weight);
  }//end constructor

  /**
   * @param value
   * @param weight
   */
  public void set(byte[] value, double weight) {
    if (value == null) {
      throw new IllegalArgumentException("value can not be null");
    }
    this.bytes = value;
    this.weight = weight;
  }

  /** @return byte[] The value of <i>this</i> key. */
  public byte[] getBytes() {
    return this.bytes;
  }

  /** @return Returns the weight associated to <i>this</i> key. */
  public double getWeight(){
    return weight;
  }//end getWeight()

  /**
   * Increments the weight of <i>this</i> key with a specified value.
   * @param weight The increment.
   */
  public void incrementWeight(double weight){
    this.weight += weight;
  }//end incrementWeight()

  /** Increments the weight of <i>this</i> key by one. */
  public void incrementWeight(){
    this.weight++;
  }//end incrementWeight()

  @Override
  public boolean equals(Object o) {
    return this.compareTo((Key)o) == 0;
  }

  @Override
  public int hashCode() {
    int result = 0;
    for (int i = 0; i < bytes.length; i++) {
      result ^= Byte.valueOf(bytes[i]).hashCode();
    }
    result ^= Double.valueOf(weight).hashCode();
    return result;
  }

  // Writable

  public void write(DataOutput out) throws IOException {
    out.writeInt(bytes.length);
    out.write(bytes);
    out.writeDouble(weight);
  }

  public void readFields(DataInput in) throws IOException {
    this.bytes = new byte[in.readInt()];
    in.readFully(this.bytes);
    weight = in.readDouble();
  }

  // Comparable

  public int compareTo(Key o) {
    int result = this.bytes.length - o.getBytes().length;
    for (int i = 0; result == 0 && i < bytes.length; i++) {
      result = this.bytes[i] - o.bytes[i];
    }

    if (result == 0) {
      result = Double.valueOf(this.weight - o.weight).intValue();
    }
    return result;
  }
}//end class
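A small, hypothetical round-trip sketch for the Key class deleted above (not from the original sources); it mirrors the DataOutput/DataInput plumbing that TestFilter, later in this change, uses for whole filters.

// Illustrative only: serialize a Key and read it back through the Writable API.
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import org.onelab.filter.Key;

public class KeyRoundTripSketch {
  public static void main(String[] args) throws IOException {
    Key original = new Key("row-1".getBytes(), 2.0);  // byte value plus explicit weight

    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    original.write(new DataOutputStream(bytes));      // writes length, bytes, weight

    Key copy = new Key();                             // default ctor, then readFields
    copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

    // compareTo considers both the byte value and the weight, so the copy
    // compares equal to the original.
    System.out.println(original.compareTo(copy) == 0);
  }
}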
@@ -1,91 +0,0 @@
package org.onelab.filter;

/**
 * Defines the different remove schemes for retouched Bloom filters.
 *
 * contract <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>.
 *
 * @version 1.0 - 7 Feb. 07
 */
public interface RemoveScheme {
  /**
   * Random selection.
   * <p>
   * The idea is to randomly select a bit to reset.
   */
  public final static short RANDOM = 0;

  /**
   * MinimumFN Selection.
   * <p>
   * The idea is to select the bit to reset that will generate the minimum
   * number of false negatives.
   */
  public final static short MINIMUM_FN = 1;

  /**
   * MaximumFP Selection.
   * <p>
   * The idea is to select the bit to reset that will remove the maximum number
   * of false positives.
   */
  public final static short MAXIMUM_FP = 2;

  /**
   * Ratio Selection.
   * <p>
   * The idea is to select the bit to reset that will, at the same time, remove
   * the maximum number of false positives while minimizing the number of false
   * negatives generated.
   */
  public final static short RATIO = 3;
}//end interface
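A hedged sketch of how these scheme constants are meant to be used (not part of the original file); it relies on the RetouchedBloomFilter class deleted later in this change, and the sizes and key values are invented.

// Illustrative only: record a false positive and clear it with one of the
// RemoveScheme constants defined above.
import org.apache.hadoop.hbase.util.Hash;
import org.onelab.filter.Key;
import org.onelab.filter.RemoveScheme;
import org.onelab.filter.RetouchedBloomFilter;

public class SelectiveClearingSketch {
  public static void main(String[] args) {
    RetouchedBloomFilter rbf = new RetouchedBloomFilter(128, 2, Hash.JENKINS_HASH);
    rbf.add(new Key("real-key".getBytes()));

    // A key the filter wrongly reports as present can be recorded ...
    Key falsePositive = new Key("false-positive".getBytes());
    rbf.addFalsePositive(falsePositive);

    // ... and later cleared. RATIO trades removed false positives against the
    // false negatives introduced; RANDOM, MINIMUM_FN and MAXIMUM_FP are the
    // alternative schemes. selectiveClearing requires the key to test positive.
    if (rbf.membershipTest(falsePositive)) {
      rbf.selectiveClearing(falsePositive, RemoveScheme.RATIO);
    }
  }
}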
@@ -1,448 +0,0 @@
package org.onelab.filter;
|
||||
|
||||
import java.io.DataInput;
|
||||
import java.io.DataOutput;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
|
||||
/**
|
||||
* Implements a <i>retouched Bloom filter</i>, as defined in the CoNEXT 2006 paper.
|
||||
* <p>
|
||||
* It allows the removal of selected false positives at the cost of introducing
|
||||
* random false negatives, and with the benefit of eliminating some random false
|
||||
* positives at the same time.
|
||||
*
|
||||
* contract <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>.
|
||||
*
|
||||
* @version 1.0 - 7 Feb. 07
|
||||
*
|
||||
* @see org.onelab.filter.Filter The general behavior of a filter
|
||||
* @see org.onelab.filter.BloomFilter A Bloom filter
|
||||
* @see org.onelab.filter.RemoveScheme The different selective clearing algorithms
|
||||
*
|
||||
* @see <a href="http://www-rp.lip6.fr/site_npa/site_rp/_publications/740-rbf_cameraready.pdf">Retouched Bloom Filters: Allowing Networked Applications to Trade Off Selected False Positives Against False Negatives</a>
|
||||
*/
|
||||
public final class RetouchedBloomFilter extends BloomFilter
|
||||
implements RemoveScheme {
|
||||
/**
|
||||
* KeyList vector (or ElementList Vector, as defined in the paper) of false positives.
|
||||
*/
|
||||
List<Key>[] fpVector;
|
||||
|
||||
/**
|
||||
* KeyList vector of keys recorded in the filter.
|
||||
*/
|
||||
List<Key>[] keyVector;
|
||||
|
||||
/**
|
||||
* Ratio vector.
|
||||
*/
|
||||
double[] ratio;
|
||||
|
||||
private Random rand;
|
||||
|
||||
/** Default constructor - use with readFields */
|
||||
public RetouchedBloomFilter() {}
|
||||
|
||||
/**
|
||||
* Constructor
|
||||
* @param vectorSize The vector size of <i>this</i> filter.
|
||||
 * @param nbHash The number of hash functions to consider.
|
||||
* @param hashType type of the hashing function (see {@link Hash}).
|
||||
*/
|
||||
public RetouchedBloomFilter(int vectorSize, int nbHash, int hashType) {
|
||||
super(vectorSize, nbHash, hashType);
|
||||
|
||||
this.rand = null;
|
||||
createVector();
|
||||
}//end constructor
|
||||
|
||||
@Override
|
||||
public void add(Key key){
|
||||
if(key == null) {
|
||||
throw new NullPointerException("key can not be null");
|
||||
}
|
||||
|
||||
int[] h = hash.hash(key);
|
||||
hash.clear();
|
||||
|
||||
for(int i = 0; i < nbHash; i++) {
|
||||
bits.set(h[i]);
|
||||
keyVector[h[i]].add(key);
|
||||
}//end for - i
|
||||
}//end add()
|
||||
|
||||
/**
|
||||
 * Adds false positive information to <i>this</i> retouched Bloom filter.
|
||||
* <p>
|
||||
* <b>Invariant</b>: if the false positive is <code>null</code>, nothing happens.
|
||||
* @param key The false positive key to add.
|
||||
*/
|
||||
public void addFalsePositive(Key key){
|
||||
if(key == null) {
|
||||
throw new NullPointerException("key can not be null");
|
||||
}
|
||||
|
||||
int[] h = hash.hash(key);
|
||||
hash.clear();
|
||||
|
||||
for(int i = 0; i < nbHash; i++) {
|
||||
fpVector[h[i]].add(key);
|
||||
}
|
||||
}//end addFalsePositive()
|
||||
|
||||
/**
|
||||
* Adds a collection of false positive information to <i>this</i> retouched Bloom filter.
|
||||
 * @param coll The collection of false positives.
|
||||
*/
|
||||
public void addFalsePositive(Collection<Key> coll) {
|
||||
if(coll == null) {
|
||||
throw new NullPointerException("Collection<Key> can not be null");
|
||||
}
|
||||
|
||||
for(Key k: coll) {
|
||||
addFalsePositive(k);
|
||||
}
|
||||
}//end addFalsePositive()
|
||||
|
||||
/**
|
||||
* Adds a list of false positive information to <i>this</i> retouched Bloom filter.
|
||||
 * @param keys The list of false positives.
|
||||
*/
|
||||
public void addFalsePositive(List<Key> keys){
|
||||
if(keys == null) {
|
||||
throw new NullPointerException("ArrayList<Key> can not be null");
|
||||
}
|
||||
|
||||
for(Key k: keys) {
|
||||
addFalsePositive(k);
|
||||
}
|
||||
}//end addFalsePositive()
|
||||
|
||||
/**
|
||||
* Adds an array of false positive information to <i>this</i> retouched Bloom filter.
|
||||
 * @param keys The array of false positives.
|
||||
*/
|
||||
public void addFalsePositive(Key[] keys){
|
||||
if(keys == null) {
|
||||
throw new NullPointerException("Key[] can not be null");
|
||||
}
|
||||
|
||||
for(int i = 0; i < keys.length; i++) {
|
||||
addFalsePositive(keys[i]);
|
||||
}
|
||||
}//end addFalsePositive()
|
||||
|
||||
/**
|
||||
* Performs the selective clearing for a given key.
|
||||
* @param k The false positive key to remove from <i>this</i> retouched Bloom filter.
|
||||
* @param scheme The selective clearing scheme to apply.
|
||||
*/
|
||||
public void selectiveClearing(Key k, short scheme) {
|
||||
if(k == null) {
|
||||
throw new NullPointerException("Key can not be null");
|
||||
}
|
||||
|
||||
if(!membershipTest(k)) {
|
||||
throw new IllegalArgumentException("Key is not a member");
|
||||
}
|
||||
|
||||
int index = 0;
|
||||
int[] h = hash.hash(k);
|
||||
|
||||
switch(scheme) {
|
||||
|
||||
case RANDOM:
|
||||
index = randomRemove();
|
||||
break;
|
||||
|
||||
case MINIMUM_FN:
|
||||
index = minimumFnRemove(h);
|
||||
break;
|
||||
|
||||
case MAXIMUM_FP:
|
||||
index = maximumFpRemove(h);
|
||||
break;
|
||||
|
||||
case RATIO:
|
||||
index = ratioRemove(h);
|
||||
break;
|
||||
|
||||
default:
|
||||
throw new AssertionError("Undefined selective clearing scheme");
|
||||
|
||||
}//end switch
|
||||
|
||||
clearBit(index);
|
||||
}//end selectiveClearing()
|
||||
|
||||
private int randomRemove() {
|
||||
if(rand == null) {
|
||||
rand = new Random();
|
||||
}
|
||||
|
||||
return rand.nextInt(nbHash);
|
||||
}//end randomRemove()
|
||||
|
||||
/**
|
||||
 * Chooses the bit position that minimizes the number of false negatives generated.
|
||||
* @param h The different bit positions.
|
||||
 * @return int The position that minimizes the number of false negatives generated.
|
||||
*/
|
||||
private int minimumFnRemove(int[] h) {
|
||||
int minIndex = Integer.MAX_VALUE;
|
||||
double minValue = Double.MAX_VALUE;
|
||||
|
||||
for(int i = 0; i < nbHash; i++) {
|
||||
double keyWeight = getWeight(keyVector[h[i]]);
|
||||
|
||||
if(keyWeight < minValue) {
|
||||
minIndex = h[i];
|
||||
minValue = keyWeight;
|
||||
}
|
||||
|
||||
}//end for - i
|
||||
|
||||
return minIndex;
|
||||
}//end minimumFnRemove()
|
||||
|
||||
/**
|
||||
 * Chooses the bit position that maximizes the number of false positives removed.
|
||||
* @param h The different bit positions.
|
||||
 * @return int The position that maximizes the number of false positives removed.
|
||||
*/
|
||||
private int maximumFpRemove(int[] h){
|
||||
int maxIndex = Integer.MIN_VALUE;
|
||||
double maxValue = Double.MIN_VALUE;
|
||||
|
||||
for(int i = 0; i < nbHash; i++) {
|
||||
double fpWeight = getWeight(fpVector[h[i]]);
|
||||
|
||||
if(fpWeight > maxValue) {
|
||||
maxValue = fpWeight;
|
||||
maxIndex = h[i];
|
||||
}
|
||||
}
|
||||
|
||||
return maxIndex;
|
||||
}//end maximumFpRemove()
|
||||
|
||||
/**
|
||||
 * Chooses the bit position that minimizes the number of false negatives generated
 * while maximizing the number of false positives removed.
|
||||
* @param h The different bit positions.
|
||||
 * @return int The position that minimizes the number of false negatives generated while maximizing the number of false positives removed.
|
||||
*/
|
||||
private int ratioRemove(int[] h){
|
||||
computeRatio();
|
||||
int minIndex = Integer.MAX_VALUE;
|
||||
double minValue = Double.MAX_VALUE;
|
||||
|
||||
for(int i = 0; i < nbHash; i++) {
|
||||
if(ratio[h[i]] < minValue) {
|
||||
minValue = ratio[h[i]];
|
||||
minIndex = h[i];
|
||||
}
|
||||
}//end for - i
|
||||
|
||||
return minIndex;
|
||||
}//end ratioRemove()
|
||||
|
||||
/**
|
||||
 * Clears a specified bit in the bit vector and keeps the KeyList vectors up to date.
|
||||
* @param index The position of the bit to clear.
|
||||
*/
|
||||
private void clearBit(int index){
|
||||
if(index < 0 || index >= vectorSize) {
|
||||
throw new ArrayIndexOutOfBoundsException(index);
|
||||
}
|
||||
|
||||
List<Key> kl = keyVector[index];
|
||||
List<Key> fpl = fpVector[index];
|
||||
|
||||
// update key list
|
||||
int listSize = kl.size();
|
||||
for(int i = 0; i < listSize && !kl.isEmpty(); i++) {
|
||||
removeKey(kl.get(0), keyVector);
|
||||
}
|
||||
|
||||
kl.clear();
|
||||
keyVector[index].clear();
|
||||
|
||||
//update false positive list
|
||||
listSize = fpl.size();
|
||||
for(int i = 0; i < listSize && !fpl.isEmpty(); i++) {
|
||||
removeKey(fpl.get(0), fpVector);
|
||||
}
|
||||
|
||||
fpl.clear();
|
||||
fpVector[index].clear();
|
||||
|
||||
//update ratio
|
||||
ratio[index] = 0.0;
|
||||
|
||||
//update bit vector
|
||||
bits.clear(index);
|
||||
}//end clearBit()
|
||||
|
||||
/**
|
||||
 * Removes a given key from <i>this</i> filter.
|
||||
* @param k The key to remove.
|
||||
* @param vector The counting vector associated to the key.
|
||||
*/
|
||||
private void removeKey(Key k, List<Key>[] vector) {
|
||||
if(k == null) {
|
||||
throw new NullPointerException("Key can not be null");
|
||||
}
|
||||
if(vector == null) {
|
||||
throw new NullPointerException("ArrayList<Key>[] can not be null");
|
||||
}
|
||||
|
||||
int[] h = hash.hash(k);
|
||||
hash.clear();
|
||||
|
||||
for(int i = 0; i < nbHash; i++) {
|
||||
vector[h[i]].remove(k);
|
||||
}
|
||||
}//end removeKey()
|
||||
|
||||
/**
|
||||
 * Computes, for each bit position, the ratio of recorded-key weight to false-positive weight (A/FP).
|
||||
*/
|
||||
private void computeRatio() {
|
||||
for(int i = 0; i < vectorSize; i++) {
|
||||
double keyWeight = getWeight(keyVector[i]);
|
||||
double fpWeight = getWeight(fpVector[i]);
|
||||
|
||||
if(keyWeight > 0 && fpWeight > 0) {
|
||||
ratio[i] = keyWeight/fpWeight;
|
||||
}
|
||||
}//end for - i
|
||||
}//end computeRatio()
|
||||
|
||||
private double getWeight(List<Key> keyList) {
|
||||
double weight = 0.0;
|
||||
for(Key k: keyList) {
|
||||
weight += k.getWeight();
|
||||
}
|
||||
return weight;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates and initialises the various vectors.
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
private void createVector() {
|
||||
fpVector = new List[vectorSize];
|
||||
keyVector = new List[vectorSize];
|
||||
ratio = new double[vectorSize];
|
||||
|
||||
for(int i = 0; i < vectorSize; i++) {
|
||||
fpVector[i] = Collections.synchronizedList(new ArrayList<Key>());
|
||||
keyVector[i] = Collections.synchronizedList(new ArrayList<Key>());
|
||||
ratio[i] = 0.0;
|
||||
}//end for -i
|
||||
}//end createVector()
|
||||
|
||||
// Writable
|
||||
|
||||
@Override
|
||||
public void write(DataOutput out) throws IOException {
|
||||
super.write(out);
|
||||
for(int i = 0; i < fpVector.length; i++) {
|
||||
List<Key> list = fpVector[i];
|
||||
out.writeInt(list.size());
|
||||
for(Key k: list) {
|
||||
k.write(out);
|
||||
}
|
||||
}
|
||||
for(int i = 0; i < keyVector.length; i++) {
|
||||
List<Key> list = keyVector[i];
|
||||
out.writeInt(list.size());
|
||||
for(Key k: list) {
|
||||
k.write(out);
|
||||
}
|
||||
}
|
||||
for(int i = 0; i < ratio.length; i++) {
|
||||
out.writeDouble(ratio[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void readFields(DataInput in) throws IOException {
|
||||
super.readFields(in);
|
||||
createVector();
|
||||
for(int i = 0; i < fpVector.length; i++) {
|
||||
List<Key> list = fpVector[i];
|
||||
int size = in.readInt();
|
||||
for(int j = 0; j < size; j++) {
|
||||
Key k = new Key();
|
||||
k.readFields(in);
|
||||
list.add(k);
|
||||
}
|
||||
}
|
||||
for(int i = 0; i < keyVector.length; i++) {
|
||||
List<Key> list = keyVector[i];
|
||||
int size = in.readInt();
|
||||
for(int j = 0; j < size; j++) {
|
||||
Key k = new Key();
|
||||
k.readFields(in);
|
||||
list.add(k);
|
||||
}
|
||||
}
|
||||
for(int i = 0; i < ratio.length; i++) {
|
||||
ratio[i] = in.readDouble();
|
||||
}
|
||||
}
|
||||
}//end class
@@ -1,94 +0,0 @@
package org.onelab.test;

import java.io.UnsupportedEncodingException;
import org.apache.hadoop.hbase.HConstants;
import org.onelab.filter.Key;

/**
 * Test class for keys.
 * <p>
 * It gives an example of how to extend Key.
 *
 * contract <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>.
 *
 * @version 1.0 - 5 Feb. 07
 *
 * @see org.onelab.filter.Key A key stored in a filter
 */
public class StringKey extends Key {

  /** Default constructor - use with readFields */
  public StringKey() {}

  /**
   * Construct a Key using the specified String and default weight
   *
   * @param key String key value
   * @throws UnsupportedEncodingException
   */
  public StringKey(String key) throws UnsupportedEncodingException {
    super(key.getBytes(HConstants.UTF8_ENCODING));
  }

  /**
   * Construct a Key using the specified string and weight
   *
   * @param key - String key value
   * @param weight key weight
   * @throws UnsupportedEncodingException
   */
  public StringKey(String key, double weight)
      throws UnsupportedEncodingException {
    super(key.getBytes(HConstants.UTF8_ENCODING), weight);
  }

}
@@ -1,323 +0,0 @@
package org.onelab.test;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.hbase.util.Hash;
|
||||
import org.onelab.filter.*;
|
||||
|
||||
/**
|
||||
* Test class.
|
||||
*
|
||||
* contract <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>.
|
||||
*
|
||||
* @version 1.0 - 8 Feb. 07
|
||||
*/
|
||||
public class TestFilter extends TestCase {
|
||||
private static final Log LOG = LogFactory.getLog(TestFilter.class);
|
||||
|
||||
/** Test a BloomFilter
|
||||
* @throws UnsupportedEncodingException
|
||||
* @throws IOException
|
||||
*/
|
||||
public void testBloomFilter() throws UnsupportedEncodingException,
|
||||
IOException {
|
||||
final StringKey[] inserted = {
|
||||
new StringKey("wmjwjzyv"),
|
||||
new StringKey("baietibz"),
|
||||
new StringKey("guhsgxnv"),
|
||||
new StringKey("mhnqycto"),
|
||||
new StringKey("xcyqafgz"),
|
||||
new StringKey("zidoamgb"),
|
||||
new StringKey("tftfirzd"),
|
||||
new StringKey("okapqlrg"),
|
||||
new StringKey("yccwzwsq"),
|
||||
new StringKey("qmonufqu"),
|
||||
new StringKey("wlsctews"),
|
||||
new StringKey("mksdhqri"),
|
||||
new StringKey("wxxllokj"),
|
||||
new StringKey("eviuqpls"),
|
||||
new StringKey("bavotqmj"),
|
||||
new StringKey("yibqzhdl"),
|
||||
new StringKey("csfqmsyr"),
|
||||
new StringKey("guxliyuh"),
|
||||
new StringKey("pzicietj"),
|
||||
new StringKey("qdwgrqwo"),
|
||||
new StringKey("ujfzecmi"),
|
||||
new StringKey("dzeqfvfi"),
|
||||
new StringKey("phoegsij"),
|
||||
new StringKey("bvudfcou"),
|
||||
new StringKey("dowzmciz"),
|
||||
new StringKey("etvhkizp"),
|
||||
new StringKey("rzurqycg"),
|
||||
new StringKey("krqfxuge"),
|
||||
new StringKey("gflcohtd"),
|
||||
new StringKey("fcrcxtps"),
|
||||
new StringKey("qrtovxdq"),
|
||||
new StringKey("aypxwrwi"),
|
||||
new StringKey("dckpyznr"),
|
||||
new StringKey("mdaawnpz"),
|
||||
new StringKey("pakdfvca"),
|
||||
new StringKey("xjglfbez"),
|
||||
new StringKey("xdsecofi"),
|
||||
new StringKey("sjlrfcab"),
|
||||
new StringKey("ebcjawxv"),
|
||||
new StringKey("hkafkjmy"),
|
||||
new StringKey("oimmwaxo"),
|
||||
new StringKey("qcuzrazo"),
|
||||
new StringKey("nqydfkwk"),
|
||||
new StringKey("frybvmlb"),
|
||||
new StringKey("amxmaqws"),
|
||||
new StringKey("gtkovkgx"),
|
||||
new StringKey("vgwxrwss"),
|
||||
new StringKey("xrhzmcep"),
|
||||
new StringKey("tafwziil"),
|
||||
new StringKey("erjmncnv"),
|
||||
new StringKey("heyzqzrn"),
|
||||
new StringKey("sowvyhtu"),
|
||||
new StringKey("heeixgzy"),
|
||||
new StringKey("ktcahcob"),
|
||||
new StringKey("ljhbybgg"),
|
||||
new StringKey("jiqfcksl"),
|
||||
new StringKey("anjdkjhm"),
|
||||
new StringKey("uzcgcuxp"),
|
||||
new StringKey("vzdhjqla"),
|
||||
new StringKey("svhgwwzq"),
|
||||
new StringKey("zhswvhbp"),
|
||||
new StringKey("ueceybwy"),
|
||||
new StringKey("czkqykcw"),
|
||||
new StringKey("ctisayir"),
|
||||
new StringKey("hppbgciu"),
|
||||
new StringKey("nhzgljfk"),
|
||||
new StringKey("vaziqllf"),
|
||||
new StringKey("narvrrij"),
|
||||
new StringKey("kcevbbqi"),
|
||||
new StringKey("qymuaqnp"),
|
||||
new StringKey("pwqpfhsr"),
|
||||
new StringKey("peyeicuk"),
|
||||
new StringKey("kudlwihi"),
|
||||
new StringKey("pkmqejlm"),
|
||||
new StringKey("ylwzjftl"),
|
||||
new StringKey("rhqrlqar"),
|
||||
new StringKey("xmftvzsp"),
|
||||
new StringKey("iaemtihk"),
|
||||
new StringKey("ymsbrqcu"),
|
||||
new StringKey("yfnlcxto"),
|
||||
new StringKey("nluqopqh"),
|
||||
new StringKey("wmrzhtox"),
|
||||
new StringKey("qnffhqbl"),
|
||||
new StringKey("zypqpnbw"),
|
||||
new StringKey("oiokhatd"),
|
||||
new StringKey("mdraddiu"),
|
||||
new StringKey("zqoatltt"),
|
||||
new StringKey("ewhulbtm"),
|
||||
new StringKey("nmswpsdf"),
|
||||
new StringKey("xsjeteqe"),
|
||||
new StringKey("ufubcbma"),
|
||||
new StringKey("phyxvrds"),
|
||||
new StringKey("vhnfldap"),
|
||||
new StringKey("zrrlycmg"),
|
||||
new StringKey("becotcjx"),
|
||||
new StringKey("wvbubokn"),
|
||||
new StringKey("avkgiopr"),
|
||||
new StringKey("mbqqxmrv"),
|
||||
new StringKey("ibplgvuu"),
|
||||
new StringKey("dghvpkgc")
|
||||
};
|
||||
|
||||
final StringKey[] notInserted = {
|
||||
new StringKey("abcdefgh"),
|
||||
new StringKey("ijklmnop"),
|
||||
new StringKey("qrstuvwx"),
|
||||
new StringKey("yzabcdef")
|
||||
};
|
||||
|
||||
    /*
     * Bloom filters are very sensitive to the number of elements inserted into
     * them.
     *
     * If m denotes the number of bits in the Bloom filter (vectorSize),
     * n denotes the number of elements inserted into the Bloom filter and
     * k represents the number of hash functions used (nbHash), then
     * according to Broder and Mitzenmacher,
     *
     * ( http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/BloomFilterSurvey.pdf )
     *
     * the probability of false positives is minimized when k is
     * approximately ln(2) * m/n.
     *
     * If we fix the number of hash functions and know the number of entries,
     * then the optimal vector size m = (k * n) / ln(2)
     */
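    // Worked example of the formula above (illustrative arithmetic only): with
    // k = 4 hash functions and the 100 keys in the inserted array, the optimal
    // vector size is m = ceil((4 * 100) / ln(2)) = ceil(577.08) = 578 bits,
    // which is what the BloomFilter construction below computes.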
|
||||
final int DEFAULT_NUMBER_OF_HASH_FUNCTIONS = 4;
|
||||
BloomFilter bf = new BloomFilter(
|
||||
(int) Math.ceil(
|
||||
(DEFAULT_NUMBER_OF_HASH_FUNCTIONS * (1.0 * inserted.length)) /
|
||||
Math.log(2.0)),
|
||||
DEFAULT_NUMBER_OF_HASH_FUNCTIONS,
|
||||
Hash.JENKINS_HASH
|
||||
);
|
||||
|
||||
for (int i = 0; i < inserted.length; i++) {
|
||||
bf.add(inserted[i]);
|
||||
}
|
||||
|
||||
// Verify that there are no false negatives and few (if any) false positives
|
||||
|
||||
checkFalsePositivesNegatives(bf, inserted, notInserted);
|
||||
|
||||
// Test serialization/deserialization
|
||||
|
||||
LOG.info("Checking serialization/deserialization");
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
DataOutputStream out = new DataOutputStream(baos);
|
||||
bf.write(out);
|
||||
ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
|
||||
DataInputStream in = new DataInputStream(bais);
|
||||
bf = new BloomFilter();
|
||||
bf.readFields(in);
|
||||
|
||||
// Verify that there are no false negatives and few (if any) false positives
|
||||
|
||||
checkFalsePositivesNegatives(bf, inserted, notInserted);
|
||||
}
|
||||
|
||||
private void checkFalsePositivesNegatives(BloomFilter bf,
|
||||
StringKey[] inserted, StringKey[] notInserted) {
|
||||
// Test membership for values we inserted. Should not get false negatives
|
||||
|
||||
LOG.info("Checking for false negatives");
|
||||
for (int i = 0; i < inserted.length; i++) {
|
||||
if (!bf.membershipTest(inserted[i])) {
|
||||
LOG.error("false negative for: " + inserted[i]);
|
||||
fail();
|
||||
}
|
||||
}
|
||||
|
||||
// Test membership for values we did not insert. It is possible to get
|
||||
// false positives
|
||||
|
||||
LOG.info("Checking for false positives");
|
||||
for (int i = 0; i < notInserted.length; i++) {
|
||||
if(bf.membershipTest(notInserted[i])) {
|
||||
LOG.error("false positive for: " + notInserted[i]);
|
||||
fail();
|
||||
}
|
||||
}
|
||||
LOG.info("Success!");
|
||||
}
|
||||
|
||||
/** Test a CountingBloomFilter
|
||||
* @throws UnsupportedEncodingException
|
||||
*/
|
||||
public void testCountingBloomFilter() throws UnsupportedEncodingException {
|
||||
Filter bf = new CountingBloomFilter(128, 2, Hash.JENKINS_HASH);
|
||||
Key key = new StringKey("toto");
|
||||
Key k2 = new StringKey("lulu");
|
||||
Key k3 = new StringKey("mama");
|
||||
bf.add(key);
|
||||
bf.add(k2);
|
||||
bf.add(k3);
|
||||
assertTrue(bf.membershipTest(key));
|
||||
assertFalse(bf.membershipTest(new StringKey("xyzzy")));
|
||||
assertFalse(bf.membershipTest(new StringKey("abcd")));
|
||||
|
||||
// delete 'key', and check that it is no longer a member
|
||||
((CountingBloomFilter)bf).delete(key);
|
||||
assertFalse(bf.membershipTest(key));
|
||||
|
||||
// to test for overflows, add 'key' enough times to overflow a 4bit bucket,
|
||||
// while asserting that it stays a member
|
||||
for(int i = 0; i < 16; i++){
|
||||
bf.add(key);
|
||||
assertTrue(bf.membershipTest(key));
|
||||
}
|
||||
// test approximateCount
|
||||
CountingBloomFilter bf3 = new CountingBloomFilter(4, 2, Hash.JENKINS_HASH);
|
||||
// test the exact range
|
||||
for (int i = 0; i < 8; i++) {
|
||||
bf3.add(key);
|
||||
bf3.add(k2);
|
||||
assertEquals(bf3.approximateCount(key), i + 1);
|
||||
assertEquals(bf3.approximateCount(k2), i + 1);
|
||||
}
|
||||
// test gently degraded counting in high-fill, high error rate filter
|
||||
for (int i = 8; i < 15; i++) {
|
||||
bf3.add(key);
|
||||
assertTrue(bf3.approximateCount(key) >= (i + 1));
|
||||
assertEquals(bf3.approximateCount(k2), 8);
|
||||
assertEquals(bf3.approximateCount(k3), 0);
|
||||
}
|
||||
}
|
||||
|
||||
/** Test a DynamicBloomFilter
|
||||
* @throws UnsupportedEncodingException
|
||||
*/
|
||||
public void testDynamicBloomFilter() throws UnsupportedEncodingException {
|
||||
Filter bf = new DynamicBloomFilter(128, 2, Hash.JENKINS_HASH, 2);
|
||||
Key key = new StringKey("toto");
|
||||
Key k2 = new StringKey("lulu");
|
||||
Key k3 = new StringKey("mama");
|
||||
bf.add(key);
|
||||
bf.add(k2);
|
||||
bf.add(k3);
|
||||
assertTrue(bf.membershipTest(key));
|
||||
assertFalse(bf.membershipTest(new StringKey("xyzzy")));
|
||||
assertFalse(bf.membershipTest(new StringKey("abcd")));
|
||||
}
|
||||
}//end class