diff --git a/CHANGES.txt b/CHANGES.txt index a41b47b4182..b96980765d9 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -17,6 +17,7 @@ Release 0.20.0 - Unreleased HBASE-1342 Add to filesystem info needed to rebuild .META. HBASE-1361 Disable bloom filters HBASE-1367 Get rid of Thrift exception 'NotFound' + HBASE-1381 Remove onelab and bloom filters files from hbase BUG FIXES HBASE-1140 "ant clean test" fails (Nitay Joffe via Stack) diff --git a/src/java/org/apache/hadoop/hbase/io/BloomFilterMapFile.java b/src/java/org/apache/hadoop/hbase/io/BloomFilterMapFile.java deleted file mode 100644 index 86fb72f4921..00000000000 --- a/src/java/org/apache/hadoop/hbase/io/BloomFilterMapFile.java +++ /dev/null @@ -1,260 +0,0 @@ -/** - * Copyright 2008 The Apache Software Foundation - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hbase.io; - -import java.io.IOException; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hbase.HRegionInfo; -import org.apache.hadoop.hbase.HStoreKey; -import org.apache.hadoop.hbase.util.Hash; -import org.apache.hadoop.hbase.io.SequenceFile; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.io.WritableComparable; -import org.onelab.filter.BloomFilter; -import org.onelab.filter.Key; - -/** - * On write, all keys are added to a bloom filter. On read, all keys are - * tested first against bloom filter. Keys are HStoreKey. If passed bloom - * filter is null, just passes invocation to parent. - */ -// TODO should be fixed generic warnings from MapFile methods -@SuppressWarnings("unchecked") -public class BloomFilterMapFile extends HBaseMapFile { - static final Log LOG = LogFactory.getLog(BloomFilterMapFile.class); - protected static final String BLOOMFILTER_FILE_NAME = "filter"; - - public static class Reader extends HBaseReader { - private final BloomFilter bloomFilter; - - /** - * @param fs - * @param dirName - * @param conf - * @param filter - * @param blockCacheEnabled - * @param hri - * @throws IOException - */ - public Reader(FileSystem fs, String dirName, Configuration conf, - final boolean filter, final boolean blockCacheEnabled, - HRegionInfo hri) - throws IOException { - super(fs, dirName, conf, blockCacheEnabled, hri); - if (filter) { - this.bloomFilter = loadBloomFilter(fs, dirName); - } else { - this.bloomFilter = null; - } - } - - private BloomFilter loadBloomFilter(FileSystem fs, String dirName) - throws IOException { - Path filterFile = new Path(dirName, BLOOMFILTER_FILE_NAME); - if(!fs.exists(filterFile)) { - LOG.warn("FileNotFound: " + filterFile + "; proceeding without"); - return null; - } - BloomFilter filter = new BloomFilter(); - FSDataInputStream in = fs.open(filterFile); - try { - filter.readFields(in); - } finally { - in.close(); - } - return filter; - } - - /** - * @see org.apache.hadoop.hbase.io.MapFile.Reader#get(org.apache.hadoop.io.WritableComparable, org.apache.hadoop.io.Writable) - */ - @Override - public Writable get(WritableComparable key, Writable val) - throws IOException { - if (bloomFilter == null) { - return super.get(key, val); - } - if(bloomFilter.membershipTest(getBloomFilterKey(key))) { - if (LOG.isDebugEnabled()) { - LOG.debug("bloom filter reported that key exists"); - } - return super.get(key, val); - } - if (LOG.isDebugEnabled()) { - LOG.debug("bloom filter reported that key does not exist"); - } - return null; - } - - /** - * @see org.apache.hadoop.hbase.io.MapFile.Reader#getClosest(org.apache.hadoop.io.WritableComparable, org.apache.hadoop.io.Writable) - */ - @Override - public WritableComparable getClosest(WritableComparable key, - Writable val) throws IOException { - if (bloomFilter == null) { - return super.getClosest(key, val); - } - // Note - the key being passed to us is always a HStoreKey - if(bloomFilter.membershipTest(getBloomFilterKey(key))) { - if (LOG.isDebugEnabled()) { - LOG.debug("bloom filter reported that key exists"); - } - return super.getClosest(key, val); - } - if (LOG.isDebugEnabled()) { - LOG.debug("bloom filter reported that key does not exist"); - } - return null; - } - - /** - * @return size of the bloom filter - */ - public int getBloomFilterSize() { - return bloomFilter == null ? 0 : bloomFilter.getVectorSize(); - } - } - - public static class Writer extends HBaseWriter { - private static final double DEFAULT_NUMBER_OF_HASH_FUNCTIONS = 4.0; - private final BloomFilter bloomFilter; - private final String dirName; - private final FileSystem fs; - - /** - * @param conf - * @param fs - * @param dirName - * @param compression - * @param filter - * @param nrows - * @param hri - * @throws IOException - */ - public Writer(Configuration conf, FileSystem fs, String dirName, - SequenceFile.CompressionType compression, final boolean filter, - int nrows, final HRegionInfo hri) - throws IOException { - super(conf, fs, dirName, compression, hri); - this.dirName = dirName; - this.fs = fs; - if (filter) { - /* - * There is no way to automatically determine the vector size and the - * number of hash functions to use. In particular, bloom filters are - * very sensitive to the number of elements inserted into them. For - * HBase, the number of entries depends on the size of the data stored - * in the column. Currently the default region size is 256MB, so the - * number of entries is approximately - * 256MB / (average value size for column). - * - * If m denotes the number of bits in the Bloom filter (vectorSize), - * n denotes the number of elements inserted into the Bloom filter and - * k represents the number of hash functions used (nbHash), then - * according to Broder and Mitzenmacher, - * - * ( http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/BloomFilterSurvey.pdf ) - * - * the probability of false positives is minimized when k is - * approximately m/n ln(2). - * - * If we fix the number of hash functions and know the number of - * entries, then the optimal vector size m = (k * n) / ln(2) - */ - BloomFilter f = null; - try { - f = new BloomFilter( - (int) Math.ceil( - (DEFAULT_NUMBER_OF_HASH_FUNCTIONS * (1.0 * nrows)) / - Math.log(2.0)), - (int) DEFAULT_NUMBER_OF_HASH_FUNCTIONS, - Hash.getHashType(conf) - ); - } catch (IllegalArgumentException e) { - LOG.warn("Failed creating bloomfilter; proceeding without", e); - } - this.bloomFilter = f; - } else { - this.bloomFilter = null; - } - } - - /** - * @see org.apache.hadoop.hbase.io.MapFile.Writer#append(org.apache.hadoop.io.WritableComparable, org.apache.hadoop.io.Writable) - */ - @Override - public void append(WritableComparable key, Writable val) - throws IOException { - if (bloomFilter != null) { - bloomFilter.add(getBloomFilterKey(key)); - } - super.append(key, val); - } - - /** - * @see org.apache.hadoop.hbase.io.MapFile.Writer#close() - */ - @Override - public synchronized void close() throws IOException { - super.close(); - if (this.bloomFilter != null) { - flushBloomFilter(); - } - } - - /** - * Flushes bloom filter to disk - * - * @throws IOException - */ - private void flushBloomFilter() throws IOException { - if (LOG.isDebugEnabled()) { - LOG.debug("flushing bloom filter for " + this.dirName); - } - FSDataOutputStream out = - fs.create(new Path(dirName, BLOOMFILTER_FILE_NAME)); - try { - bloomFilter.write(out); - } finally { - out.close(); - } - if (LOG.isDebugEnabled()) { - LOG.debug("flushed bloom filter for " + this.dirName); - } - } - } - - /** - * Custom bloom filter key maker. - * @param key - * @return Key made of bytes of row only. - */ - protected static Key getBloomFilterKey(WritableComparable key) { - return new Key(((HStoreKey) key).getRow()); - } -} \ No newline at end of file diff --git a/src/java/org/apache/hadoop/hbase/io/HalfMapFileReader.java b/src/java/org/apache/hadoop/hbase/io/HalfMapFileReader.java index d94536b6857..aff3fc00042 100644 --- a/src/java/org/apache/hadoop/hbase/io/HalfMapFileReader.java +++ b/src/java/org/apache/hadoop/hbase/io/HalfMapFileReader.java @@ -44,7 +44,7 @@ import org.apache.hadoop.io.WritableComparable; *

This file is not splitable. Calls to {@link #midKey()} return null. */ //TODO should be fixed generic warnings from MapFile methods -public class HalfMapFileReader extends BloomFilterMapFile.Reader { +public class HalfMapFileReader extends HBaseMapFile.HBaseReader { private final boolean top; private final HStoreKey midkey; private boolean firstNextCall = true; @@ -63,7 +63,7 @@ public class HalfMapFileReader extends BloomFilterMapFile.Reader { final WritableComparable mk, final HRegionInfo hri) throws IOException { - this(fs, dirName, conf, r, mk, false, false, hri); + this(fs, dirName, conf, r, mk, false, hri); } /** @@ -72,18 +72,17 @@ public class HalfMapFileReader extends BloomFilterMapFile.Reader { * @param conf * @param r * @param mk - * @param filter * @param blockCacheEnabled * @param hri * @throws IOException */ public HalfMapFileReader(final FileSystem fs, final String dirName, final Configuration conf, final Range r, - final WritableComparable mk, final boolean filter, + final WritableComparable mk, final boolean blockCacheEnabled, final HRegionInfo hri) throws IOException { - super(fs, dirName, conf, filter, blockCacheEnabled, hri); + super(fs, dirName, conf, blockCacheEnabled, hri); // This is not actual midkey for this half-file; its just border // around which we split top and bottom. Have to look in files to find // actual last and first keys for bottom and top halves. Half-files don't @@ -211,4 +210,4 @@ public class HalfMapFileReader extends BloomFilterMapFile.Reader { checkKey(key); return super.seek(key); } -} \ No newline at end of file +} diff --git a/src/java/org/apache/hadoop/hbase/util/Hash.java b/src/java/org/apache/hadoop/hbase/util/Hash.java index fac81645e63..d5a5e8a3d22 100644 --- a/src/java/org/apache/hadoop/hbase/util/Hash.java +++ b/src/java/org/apache/hadoop/hbase/util/Hash.java @@ -24,9 +24,6 @@ import org.apache.hadoop.conf.Configuration; * This class represents a common API for hashing functions. */ public abstract class Hash { - // TODO: Fix the design tangle that has classes over in org.onelab.filter - // referring to this class. Would need to also move the Jenkins and Murmur - // hashing function too. /** Constant to denote invalid hash type. */ public static final int INVALID_HASH = -1; /** Constant to denote {@link JenkinsHash}. */ diff --git a/src/java/org/onelab/filter/BloomFilter.java b/src/java/org/onelab/filter/BloomFilter.java deleted file mode 100644 index 061774f8656..00000000000 --- a/src/java/org/onelab/filter/BloomFilter.java +++ /dev/null @@ -1,238 +0,0 @@ -/** - * - * Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org) - * All rights reserved. - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the distribution. - * - Neither the name of the University Catholique de Louvain - UCL - * nor the names of its contributors may be used to endorse or - * promote products derived from this software without specific prior - * written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.onelab.filter; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.BitSet; - - -/** - * Implements a Bloom filter, as defined by Bloom in 1970. - *

- * The Bloom filter is a data structure that was introduced in 1970 and that has been adopted by - * the networking research community in the past decade thanks to the bandwidth efficiencies that it - * offers for the transmission of set membership information between networked hosts. A sender encodes - * the information into a bit vector, the Bloom filter, that is more compact than a conventional - * representation. Computation and space costs for construction are linear in the number of elements. - * The receiver uses the filter to test whether various elements are members of the set. Though the - * filter will occasionally return a false positive, it will never return a false negative. When creating - * the filter, the sender can choose its desired point in a trade-off between the false positive rate and the size. - * - * contract European Commission One-Lab Project 034819. - * - * @version 1.0 - 2 Feb. 07 - * - * @see org.onelab.filter.Filter The general behavior of a filter - * - * @see Space/Time Trade-Offs in Hash Coding with Allowable Errors - */ -public class BloomFilter extends Filter { - private static final byte[] bitvalues = new byte[] { - (byte)0x01, - (byte)0x02, - (byte)0x04, - (byte)0x08, - (byte)0x10, - (byte)0x20, - (byte)0x40, - (byte)0x80 - }; - - /** The bit vector. */ - BitSet bits; - - /** Default constructor - use with readFields */ - public BloomFilter() { - super(); - } - - /** - * Constructor - * @param vectorSize The vector size of this filter. - * @param nbHash The number of hash function to consider. - * @param hashType type of the hashing function (see {@link Hash}). - */ - public BloomFilter(int vectorSize, int nbHash, int hashType){ - super(vectorSize, nbHash, hashType); - - bits = new BitSet(this.vectorSize); - }//end constructor - - @Override - public void add(Key key) { - if(key == null) { - throw new NullPointerException("key cannot be null"); - } - - int[] h = hash.hash(key); - hash.clear(); - - for(int i = 0; i < nbHash; i++) { - bits.set(h[i]); - } - }//end add() - - @Override - public void and(Filter filter){ - if(filter == null - || !(filter instanceof BloomFilter) - || filter.vectorSize != this.vectorSize - || filter.nbHash != this.nbHash) { - throw new IllegalArgumentException("filters cannot be and-ed"); - } - - this.bits.and(((BloomFilter) filter).bits); - }//end and() - - @Override - public boolean membershipTest(Key key){ - if(key == null) { - throw new NullPointerException("key cannot be null"); - } - - int[] h = hash.hash(key); - hash.clear(); - for(int i = 0; i < nbHash; i++) { - if(!bits.get(h[i])) { - return false; - } - } - return true; - }//end memberhsipTest() - - @Override - public void not(){ - bits.flip(0, vectorSize - 1); - }//end not() - - @Override - public void or(Filter filter){ - if(filter == null - || !(filter instanceof BloomFilter) - || filter.vectorSize != this.vectorSize - || filter.nbHash != this.nbHash) { - throw new IllegalArgumentException("filters cannot be or-ed"); - } - bits.or(((BloomFilter) filter).bits); - }//end or() - - @Override - public void xor(Filter filter){ - if(filter == null - || !(filter instanceof BloomFilter) - || filter.vectorSize != this.vectorSize - || filter.nbHash != this.nbHash) { - throw new IllegalArgumentException("filters cannot be xor-ed"); - } - bits.xor(((BloomFilter) filter).bits); - }//and xor() - - @Override - public String toString(){ - return bits.toString(); - }//end toString() - - @Override - public Object clone(){ - BloomFilter bf = new BloomFilter(vectorSize, nbHash, hashType); - bf.or(this); - return bf; - }//end clone() - - /** - * @return size of the the bloomfilter - */ - public int getVectorSize() { - return this.vectorSize; - } - - // Writable - - @Override - public void write(DataOutput out) throws IOException { - super.write(out); - byte[] bytes = new byte[getNBytes()]; - for(int i = 0, byteIndex = 0, bitIndex = 0; i < vectorSize; i++, bitIndex++) { - if (bitIndex == 8) { - bitIndex = 0; - byteIndex++; - } - if (bitIndex == 0) { - bytes[byteIndex] = 0; - } - if (bits.get(i)) { - bytes[byteIndex] |= bitvalues[bitIndex]; - } - } - out.write(bytes); - } - - @Override - public void readFields(DataInput in) throws IOException { - super.readFields(in); - bits = new BitSet(this.vectorSize); - byte[] bytes = new byte[getNBytes()]; - in.readFully(bytes); - for(int i = 0, byteIndex = 0, bitIndex = 0; i < vectorSize; i++, bitIndex++) { - if (bitIndex == 8) { - bitIndex = 0; - byteIndex++; - } - if ((bytes[byteIndex] & bitvalues[bitIndex]) != 0) { - bits.set(i); - } - } - } - - /* @return number of bytes needed to hold bit vector */ - private int getNBytes() { - return (vectorSize + 7) / 8; - } -}//end class diff --git a/src/java/org/onelab/filter/CountingBloomFilter.java b/src/java/org/onelab/filter/CountingBloomFilter.java deleted file mode 100644 index d4060ed1952..00000000000 --- a/src/java/org/onelab/filter/CountingBloomFilter.java +++ /dev/null @@ -1,309 +0,0 @@ -/** - * - * Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org) - * All rights reserved. - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the distribution. - * - Neither the name of the University Catholique de Louvain - UCL - * nor the names of its contributors may be used to endorse or - * promote products derived from this software without specific prior - * written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.onelab.filter; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -/** - * Implements a counting Bloom filter, as defined by Fan et al. in a ToN - * 2000 paper. - *

- * A counting Bloom filter is an improvement to standard a Bloom filter as it - * allows dynamic additions and deletions of set membership information. This - * is achieved through the use of a counting vector instead of a bit vector. - * - * contract European Commission One-Lab Project 034819. - * - * @version 1.1 - 19 Jan. 08 - * - * @see org.onelab.filter.Filter The general behavior of a filter - * - * @see Summary cache: a scalable wide-area web cache sharing protocol - */ -public final class CountingBloomFilter extends Filter { - /** Storage for the counting buckets */ - private long[] buckets; - - /** We are using 4bit buckets, so each bucket can count to 15 */ - private final static long BUCKET_MAX_VALUE = 15; - - /** Default constructor - use with readFields */ - public CountingBloomFilter() {} - - /** - * Constructor - * @param vectorSize The vector size of this filter. - * @param nbHash The number of hash function to consider. - * @param hashType type of the hashing function (see {@link Hash}). - */ - public CountingBloomFilter(int vectorSize, int nbHash, int hashType){ - super(vectorSize, nbHash, hashType); - buckets = new long[buckets2words(vectorSize)]; - }//end constructor - - /** returns the number of 64 bit words it would take to hold vectorSize buckets */ - private static int buckets2words(int vectorSize) { - return ((vectorSize - 1) >>> 4) + 1; - } - - - @Override - public void add(Key key) { - if(key == null) { - throw new NullPointerException("key can not be null"); - } - - int[] h = hash.hash(key); - hash.clear(); - - for(int i = 0; i < nbHash; i++) { - // find the bucket - int wordNum = h[i] >> 4; // div 16 - int bucketShift = (h[i] & 0x0f) << 2; // (mod 16) * 4 - - long bucketMask = 15L << bucketShift; - long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift; - - // only increment if the count in the bucket is less than BUCKET_MAX_VALUE - if(bucketValue < BUCKET_MAX_VALUE) { - // increment by 1 - buckets[wordNum] = (buckets[wordNum] & ~bucketMask) | ((bucketValue + 1) << bucketShift); - } - } - }//end add() - - /** - * Removes a specified key from this counting Bloom filter. - *

- * Invariant: nothing happens if the specified key does not belong to this counter Bloom filter. - * @param key The key to remove. - */ - public void delete(Key key) { - if(key == null) { - throw new NullPointerException("Key may not be null"); - } - if(!membershipTest(key)) { - throw new IllegalArgumentException("Key is not a member"); - } - - int[] h = hash.hash(key); - hash.clear(); - - for(int i = 0; i < nbHash; i++) { - // find the bucket - int wordNum = h[i] >> 4; // div 16 - int bucketShift = (h[i] & 0x0f) << 2; // (mod 16) * 4 - - long bucketMask = 15L << bucketShift; - long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift; - - // only decrement if the count in the bucket is between 0 and BUCKET_MAX_VALUE - if(bucketValue >= 1 && bucketValue < BUCKET_MAX_VALUE) { - // decrement by 1 - buckets[wordNum] = (buckets[wordNum] & ~bucketMask) | ((bucketValue - 1) << bucketShift); - } - } - }//end delete - - @Override - public void and(Filter filter){ - if(filter == null - || !(filter instanceof CountingBloomFilter) - || filter.vectorSize != this.vectorSize - || filter.nbHash != this.nbHash) { - throw new IllegalArgumentException("filters cannot be and-ed"); - } - CountingBloomFilter cbf = (CountingBloomFilter)filter; - - int sizeInWords = buckets2words(vectorSize); - for(int i = 0; i < sizeInWords; i++) { - this.buckets[i] &= cbf.buckets[i]; - } - }//end and() - - @Override - public boolean membershipTest(Key key){ - if(key == null) { - throw new NullPointerException("Key may not be null"); - } - - int[] h = hash.hash(key); - hash.clear(); - - for(int i = 0; i < nbHash; i++) { - // find the bucket - int wordNum = h[i] >> 4; // div 16 - int bucketShift = (h[i] & 0x0f) << 2; // (mod 16) * 4 - - long bucketMask = 15L << bucketShift; - - if((buckets[wordNum] & bucketMask) == 0) { - return false; - } - } - - return true; - }//end membershipTest() - - /** - * This method calculates an approximate count of the key, i.e. how many - * times the key was added to the filter. This allows the filter to be - * used as an approximate key -> count map. - *

NOTE: due to the bucket size of this filter, inserting the same - * key more than 15 times will cause an overflow at all filter positions - * associated with this key, and it will significantly increase the error - * rate for this and other keys. For this reason the filter can only be - * used to store small count values 0 <= N << 15. - * @param key key to be tested - * @return 0 if the key is not present. Otherwise, a positive value v will - * be returned such that v == count with probability equal to the - * error rate of this filter, and v > count otherwise. - * Additionally, if the filter experienced an underflow as a result of - * {@link #delete(Key)} operation, the return value may be lower than the - * count with the probability of the false negative rate of such - * filter. - */ - public int approximateCount(Key key) { - int res = Integer.MAX_VALUE; - int[] h = hash.hash(key); - hash.clear(); - for (int i = 0; i < nbHash; i++) { - // find the bucket - int wordNum = h[i] >> 4; // div 16 - int bucketShift = (h[i] & 0x0f) << 2; // (mod 16) * 4 - - long bucketMask = 15L << bucketShift; - long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift; - if (bucketValue < res) res = (int)bucketValue; - } - if (res != Integer.MAX_VALUE) { - return res; - } - return 0; - } - - @Override - public void not(){ - throw new UnsupportedOperationException("not() is undefined for " - + this.getClass().getName()); - }//end not() - - @Override - public void or(Filter filter){ - if(filter == null - || !(filter instanceof CountingBloomFilter) - || filter.vectorSize != this.vectorSize - || filter.nbHash != this.nbHash) { - throw new IllegalArgumentException("filters cannot be or-ed"); - } - - CountingBloomFilter cbf = (CountingBloomFilter)filter; - - int sizeInWords = buckets2words(vectorSize); - for(int i = 0; i < sizeInWords; i++) { - this.buckets[i] |= cbf.buckets[i]; - } - }//end or() - - @Override - public void xor(Filter filter){ - throw new UnsupportedOperationException("xor() is undefined for " - + this.getClass().getName()); - }//end xor() - - @Override - public String toString(){ - StringBuilder res = new StringBuilder(); - - for(int i = 0; i < vectorSize; i++) { - if(i > 0) { - res.append(" "); - } - - int wordNum = i >> 4; // div 16 - int bucketShift = (i & 0x0f) << 2; // (mod 16) * 4 - - long bucketMask = 15L << bucketShift; - long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift; - - res.append(bucketValue); - } - - return res.toString(); - }//end toString() - - @Override - public Object clone(){ - CountingBloomFilter cbf = new CountingBloomFilter(vectorSize, nbHash, hashType); - cbf.buckets = this.buckets.clone(); - return cbf; - } - - // Writable - - @Override - public void write(DataOutput out) throws IOException { - super.write(out); - int sizeInWords = buckets2words(vectorSize); - for(int i = 0; i < sizeInWords; i++) { - out.writeLong(buckets[i]); - } - } - - @Override - public void readFields(DataInput in) throws IOException { - super.readFields(in); - int sizeInWords = buckets2words(vectorSize); - buckets = new long[sizeInWords]; - for(int i = 0; i < sizeInWords; i++) { - buckets[i] = in.readLong(); - } - } -} \ No newline at end of file diff --git a/src/java/org/onelab/filter/DynamicBloomFilter.java b/src/java/org/onelab/filter/DynamicBloomFilter.java deleted file mode 100644 index d2a3f0277a5..00000000000 --- a/src/java/org/onelab/filter/DynamicBloomFilter.java +++ /dev/null @@ -1,301 +0,0 @@ -/** - * - * Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org) - * All rights reserved. - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the distribution. - * - Neither the name of the University Catholique de Louvain - UCL - * nor the names of its contributors may be used to endorse or - * promote products derived from this software without specific prior - * written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.onelab.filter; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -/** - * Implements a dynamic Bloom filter, as defined in the INFOCOM 2006 paper. - *

- * A dynamic Bloom filter (DBF) makes use of a s * m bit matrix but - * each of the s rows is a standard Bloom filter. The creation - * process of a DBF is iterative. At the start, the DBF is a 1 * m - * bit matrix, i.e., it is composed of a single standard Bloom filter. - * It assumes that nr elements are recorded in the - * initial bit vector, where nr <= n (n is - * the cardinality of the set A to record in the filter). - *

- * As the size of A grows during the execution of the application, - * several keys must be inserted in the DBF. When inserting a key into the DBF, - * one must first get an active Bloom filter in the matrix. A Bloom filter is - * active when the number of recorded keys, nr, is - * strictly less than the current cardinality of A, n. - * If an active Bloom filter is found, the key is inserted and - * nr is incremented by one. On the other hand, if there - * is no active Bloom filter, a new one is created (i.e., a new row is added to - * the matrix) according to the current size of A and the element - * is added in this new Bloom filter and the nr value of - * this new Bloom filter is set to one. A given key is said to belong to the - * DBF if the k positions are set to one in one of the matrix rows. - * - * contract European Commission One-Lab Project 034819. - * - * @version 1.0 - 6 Feb. 07 - * - * @see org.onelab.filter.Filter The general behavior of a filter - * @see org.onelab.filter.BloomFilter A Bloom filter - * - * @see Theory and Network Applications of Dynamic Bloom Filters - */ -public class DynamicBloomFilter extends Filter { - /** - * Threshold for the maximum number of key to record in a dynamic Bloom filter row. - */ - private int nr; - - /** - * The number of keys recorded in the current standard active Bloom filter. - */ - private int currentNbRecord; - - /** - * The matrix of Bloom filter. - */ - private BloomFilter[] matrix; - - /** - * Zero-args constructor for the serialization. - */ - public DynamicBloomFilter() { } - - /** - * Constructor. - *

- * Builds an empty Dynamic Bloom filter. - * @param vectorSize The number of bits in the vector. - * @param nbHash The number of hash function to consider. - * @param hashType type of the hashing function (see {@link Hash}). - * @param nr The threshold for the maximum number of keys to record in a dynamic Bloom filter row. - */ - public DynamicBloomFilter(int vectorSize, int nbHash, int hashType, int nr) { - super(vectorSize, nbHash, hashType); - - this.nr = nr; - this.currentNbRecord = 0; - - matrix = new BloomFilter[1]; - matrix[0] = new BloomFilter(this.vectorSize, this.nbHash, this.hashType); - }//end constructor - - @Override - public void add(Key key){ - if(key == null) { - throw new NullPointerException("Key can not be null"); - } - - BloomFilter bf = getActiveStandardBF(); - - if(bf == null){ - addRow(); - bf = matrix[matrix.length - 1]; - currentNbRecord = 0; - } - - bf.add(key); - - currentNbRecord++; - }//end add() - - @Override - public void and(Filter filter) { - if(filter == null - || !(filter instanceof DynamicBloomFilter) - || filter.vectorSize != this.vectorSize - || filter.nbHash != this.nbHash) { - throw new IllegalArgumentException("filters cannot be and-ed"); - } - - DynamicBloomFilter dbf = (DynamicBloomFilter)filter; - - if(dbf.matrix.length != this.matrix.length || dbf.nr != this.nr) { - throw new IllegalArgumentException("filters cannot be and-ed"); - } - - for(int i = 0; i < matrix.length; i++) { - matrix[i].and(dbf.matrix[i]); - } - }//end and() - - @Override - public boolean membershipTest(Key key){ - if(key == null) { - return true; - } - - for(int i = 0; i < matrix.length; i++) { - if(matrix[i].membershipTest(key)) { - return true; - } - } - - return false; - }//end membershipTest() - - @Override - public void not(){ - for(int i = 0; i < matrix.length; i++) { - matrix[i].not(); - } - }//end not() - - @Override - public void or(Filter filter){ - if(filter == null - || !(filter instanceof DynamicBloomFilter) - || filter.vectorSize != this.vectorSize - || filter.nbHash != this.nbHash) { - throw new IllegalArgumentException("filters cannot be or-ed"); - } - - DynamicBloomFilter dbf = (DynamicBloomFilter)filter; - - if(dbf.matrix.length != this.matrix.length || dbf.nr != this.nr) { - throw new IllegalArgumentException("filters cannot be or-ed"); - } - for(int i = 0; i < matrix.length; i++) { - matrix[i].or(dbf.matrix[i]); - } - }//end or() - - @Override - public void xor(Filter filter){ - if(filter == null - || !(filter instanceof DynamicBloomFilter) - || filter.vectorSize != this.vectorSize - || filter.nbHash != this.nbHash) { - throw new IllegalArgumentException("filters cannot be xor-ed"); - } - DynamicBloomFilter dbf = (DynamicBloomFilter)filter; - - if(dbf.matrix.length != this.matrix.length || dbf.nr != this.nr) { - throw new IllegalArgumentException("filters cannot be xor-ed"); - } - - for(int i = 0; ithis dynamic Bloom filter. - */ - private void addRow(){ - BloomFilter[] tmp = new BloomFilter[matrix.length + 1]; - - for(int i = 0; i < matrix.length; i++) { - tmp[i] = (BloomFilter)matrix[i].clone(); - } - - tmp[tmp.length-1] = new BloomFilter(vectorSize, nbHash, hashType); - - matrix = tmp; - }//end addRow() - - /** - * Returns the active standard Bloom filter in this dynamic Bloom filter. - * @return BloomFilter The active standard Bloom filter. - * Null otherwise. - */ - private BloomFilter getActiveStandardBF() { - if(currentNbRecord >= nr) { - return null; - } - - return matrix[matrix.length - 1]; - }//end getActiveStandardBF() -}//end class diff --git a/src/java/org/onelab/filter/Filter.java b/src/java/org/onelab/filter/Filter.java deleted file mode 100644 index 8737826e4b9..00000000000 --- a/src/java/org/onelab/filter/Filter.java +++ /dev/null @@ -1,216 +0,0 @@ -/** - * - * Copyright (c) 2005, European Commission project OneLab under contract 034819 - * (http://www.one-lab.org) - * - * All rights reserved. - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the distribution. - * - Neither the name of the University Catholique de Louvain - UCL - * nor the names of its contributors may be used to endorse or - * promote products derived from this software without specific prior - * written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.onelab.filter; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.Collection; -import java.util.List; - -import org.apache.hadoop.hbase.util.Hash; -import org.apache.hadoop.io.Writable; - -/** - * Defines the general behavior of a filter. - *

- * A filter is a data structure which aims at offering a lossy summary of a set A. The - * key idea is to map entries of A (also called keys) into several positions - * in a vector through the use of several hash functions. - *

- * Typically, a filter will be implemented as a Bloom filter (or a Bloom filter extension). - *

- * It must be extended in order to define the real behavior. - * - * @see org.onelab.filter.Filter The general behavior of a filter - * - * @version 1.0 - 2 Feb. 07 - * - * @see org.onelab.filter.Key The general behavior of a key - * @see org.onelab.filter.HashFunction A hash function - */ -public abstract class Filter implements Writable { - private static final int VERSION = -1; // negative to accommodate for old format - /** The vector size of this filter. */ - protected int vectorSize; - - /** The hash function used to map a key to several positions in the vector. */ - protected HashFunction hash; - - /** The number of hash function to consider. */ - protected int nbHash; - - /** Type of hashing function to use. */ - protected int hashType; - - protected Filter() {} - - /** - * Constructor. - * @param vectorSize The vector size of this filter. - * @param nbHash The number of hash functions to consider. - * @param hashType type of the hashing function (see {@link Hash}). - */ - protected Filter(int vectorSize, int nbHash, int hashType) { - this.vectorSize = vectorSize; - this.nbHash = nbHash; - this.hashType = hashType; - this.hash = new HashFunction(this.vectorSize, this.nbHash, this.hashType); - }//end constructor - - /** - * Adds a key to this filter. - * @param key The key to add. - */ - public abstract void add(Key key); - - /** - * Determines wether a specified key belongs to this filter. - * @param key The key to test. - * @return boolean True if the specified key belongs to this filter. - * False otherwise. - */ - public abstract boolean membershipTest(Key key); - - /** - * Peforms a logical AND between this filter and a specified filter. - *

- * Invariant: The result is assigned to this filter. - * @param filter The filter to AND with. - */ - public abstract void and(Filter filter); - - /** - * Peforms a logical OR between this filter and a specified filter. - *

- * Invariant: The result is assigned to this filter. - * @param filter The filter to OR with. - */ - public abstract void or(Filter filter); - - /** - * Peforms a logical XOR between this filter and a specified filter. - *

- * Invariant: The result is assigned to this filter. - * @param filter The filter to XOR with. - */ - public abstract void xor(Filter filter); - - /** - * Performs a logical NOT on this filter. - *

- * The result is assigned to this filter. - */ - public abstract void not(); - - /** - * Adds a list of keys to this filter. - * @param keys The list of keys. - */ - public void add(List keys){ - if(keys == null) { - throw new IllegalArgumentException("ArrayList may not be null"); - } - - for(Key key: keys) { - add(key); - } - }//end add() - - /** - * Adds a collection of keys to this filter. - * @param keys The collection of keys. - */ - public void add(Collection keys){ - if(keys == null) { - throw new IllegalArgumentException("Collection may not be null"); - } - for(Key key: keys) { - add(key); - } - }//end add() - - /** - * Adds an array of keys to this filter. - * @param keys The array of keys. - */ - public void add(Key[] keys){ - if(keys == null) { - throw new IllegalArgumentException("Key[] may not be null"); - } - for(int i = 0; i < keys.length; i++) { - add(keys[i]); - } - }//end add() - - // Writable interface - - public void write(DataOutput out) throws IOException { - out.writeInt(VERSION); - out.writeInt(this.nbHash); - out.writeByte(this.hashType); - out.writeInt(this.vectorSize); - } - - public void readFields(DataInput in) throws IOException { - int ver = in.readInt(); - if (ver > 0) { // old unversioned format - this.nbHash = ver; - this.hashType = Hash.JENKINS_HASH; - } else if (ver == VERSION) { - this.nbHash = in.readInt(); - this.hashType = in.readByte(); - } else { - throw new IOException("Unsupported version: " + ver); - } - this.vectorSize = in.readInt(); - this.hash = new HashFunction(this.vectorSize, this.nbHash, this.hashType); - } -}//end class diff --git a/src/java/org/onelab/filter/HashFunction.java b/src/java/org/onelab/filter/HashFunction.java deleted file mode 100644 index cf97c7bcaa2..00000000000 --- a/src/java/org/onelab/filter/HashFunction.java +++ /dev/null @@ -1,127 +0,0 @@ -/** - * - * Copyright (c) 2005, European Commission project OneLab under contract 034819 - * (http://www.one-lab.org) - * - * All rights reserved. - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the distribution. - * - Neither the name of the University Catholique de Louvain - UCL - * nor the names of its contributors may be used to endorse or - * promote products derived from this software without specific prior - * written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.onelab.filter; - -import org.apache.hadoop.hbase.util.Hash; - -/** - * Implements a hash object that returns a certain number of hashed values. - *

- * It is based on the SHA-1 algorithm. - * - * @see org.onelab.filter.Filter The general behavior of a filter - * - * @version 1.0 - 2 Feb. 07 - * - * @see org.onelab.filter.Key The general behavior of a key being stored in a filter - * @see org.onelab.filter.Filter The general behavior of a filter - * - * @see SHA-1 algorithm - */ -public final class HashFunction { - /** The number of hashed values. */ - private int nbHash; - - /** The maximum highest returned value. */ - private int maxValue; - - /** Hashing algorithm to use. */ - private Hash hashFunction; - - /** - * Constructor. - *

- * Builds a hash function that must obey to a given maximum number of returned values and a highest value. - * @param maxValue The maximum highest returned value. - * @param nbHash The number of resulting hashed values. - * @param hashType type of the hashing function (see {@link Hash}). - */ - public HashFunction(int maxValue, int nbHash, int hashType) { - if(maxValue <= 0) { - throw new IllegalArgumentException("maxValue must be > 0"); - } - - if(nbHash <= 0) { - throw new IllegalArgumentException("nbHash must be > 0"); - } - - this.maxValue = maxValue; - this.nbHash = nbHash; - this.hashFunction = Hash.getInstance(hashType); - if (this.hashFunction == null) - throw new IllegalArgumentException("hashType must be known"); - }//end constructor - - /** Clears this hash function. A NOOP */ - public void clear() { - } - - /** - * Hashes a specified key into several integers. - * @param k The specified key. - * @return The array of hashed values. - */ - public int[] hash(Key k){ - byte[] b = k.getBytes(); - if(b == null) { - throw new NullPointerException("buffer reference is null"); - } - if(b.length == 0) { - throw new IllegalArgumentException("key length must be > 0"); - } - int[] result = new int[nbHash]; - for (int i = 0, initval = 0; i < nbHash; i++) { - initval = hashFunction.hash(b, initval); - result[i] = Math.abs(initval) % maxValue; - } - return result; - }//end hash() - -}//end class diff --git a/src/java/org/onelab/filter/Key.java b/src/java/org/onelab/filter/Key.java deleted file mode 100644 index 4b4274df504..00000000000 --- a/src/java/org/onelab/filter/Key.java +++ /dev/null @@ -1,174 +0,0 @@ -/** - * - * Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org) - * All rights reserved. - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the distribution. - * - Neither the name of the University Catholique de Louvain - UCL - * nor the names of its contributors may be used to endorse or - * promote products derived from this software without specific prior - * written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.onelab.filter; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -import org.apache.hadoop.io.WritableComparable; - -/** - * The general behavior of a key that must be stored in a filter. - * - * @see org.onelab.filter.Filter The general behavior of a filter - */ -public class Key implements WritableComparable { - /** Byte value of key */ - byte[] bytes; - - /** - * The weight associated to this key. - *

- * Invariant: if it is not specified, each instance of - * Key will have a default weight of 1.0 - */ - double weight; - - /** default constructor - use with readFields */ - public Key() {} - - /** - * Constructor. - *

- * Builds a key with a default weight. - * @param value The byte value of this key. - */ - public Key(byte[] value) { - this(value, 1.0); - }//end constructor - - /** - * Constructor. - *

- * Builds a key with a specified weight. - * @param value The value of this key. - * @param weight The weight associated to this key. - */ - public Key(byte[] value, double weight) { - set(value, weight); - }//end constructor - - /** - * @param value - * @param weight - */ - public void set(byte[] value, double weight) { - if(value == null) { - throw new IllegalArgumentException("value can not be null"); - } - this.bytes = value; - this.weight = weight; - } - - /** @return byte[] The value of this key. */ - public byte[] getBytes() { - return this.bytes; - } - - /** @return Returns the weight associated to this key. */ - public double getWeight(){ - return weight; - }//end getWeight() - - /** - * Increments the weight of this key with a specified value. - * @param weight The increment. - */ - public void incrementWeight(double weight){ - this.weight += weight; - }//end incrementWeight() - - /** Increments the weight of this key by one. */ - public void incrementWeight(){ - this.weight++; - }//end incrementWeight() - - @Override - public boolean equals(Object o) { - return this.compareTo((Key)o) == 0; - } - - @Override - public int hashCode() { - int result = 0; - for(int i = 0; i < bytes.length; i++) { - result ^= Byte.valueOf(bytes[i]).hashCode(); - } - result ^= Double.valueOf(weight).hashCode(); - return result; - } - - // Writable - - public void write(DataOutput out) throws IOException { - out.writeInt(bytes.length); - out.write(bytes); - out.writeDouble(weight); - } - - public void readFields(DataInput in) throws IOException { - this.bytes = new byte[in.readInt()]; - in.readFully(this.bytes); - weight = in.readDouble(); - } - - // Comparable - - public int compareTo(Key o) { - int result = this.bytes.length - o.getBytes().length; - for(int i = 0; result == 0 && i < bytes.length; i++) { - result = this.bytes[i] - o.bytes[i]; - } - - if(result == 0) { - result = Double.valueOf(this.weight - o.weight).intValue(); - } - return result; - } -}//end class diff --git a/src/java/org/onelab/filter/RemoveScheme.java b/src/java/org/onelab/filter/RemoveScheme.java deleted file mode 100644 index be6ea7d88fc..00000000000 --- a/src/java/org/onelab/filter/RemoveScheme.java +++ /dev/null @@ -1,91 +0,0 @@ -/** - * - * Copyright (c) 2005, European Commission project OneLab under contract 034819 - * (http://www.one-lab.org) - * - * All rights reserved. - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the distribution. - * - Neither the name of the University Catholique de Louvain - UCL - * nor the names of its contributors may be used to endorse or - * promote products derived from this software without specific prior - * written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.onelab.filter; - -/** - * Defines the different remove scheme for retouched Bloom filters. - * - * contract European Commission One-Lab Project 034819. - * - * @version 1.0 - 7 Feb. 07 - */ -public interface RemoveScheme { - /** - * Random selection. - *

- * The idea is to randomly select a bit to reset. - */ - public final static short RANDOM = 0; - - /** - * MinimumFN Selection. - *

- * The idea is to select the bit to reset that will generate the minimum - * number of false negative. - */ - public final static short MINIMUM_FN = 1; - - /** - * MaximumFP Selection. - *

- * The idea is to select the bit to reset that will remove the maximum number - * of false positive. - */ - public final static short MAXIMUM_FP = 2; - - /** - * Ratio Selection. - *

- * The idea is to select the bit to reset that will, at the same time, remove - * the maximum number of false positve while minimizing the amount of false - * negative generated. - */ - public final static short RATIO = 3; -}//end interface diff --git a/src/java/org/onelab/filter/RetouchedBloomFilter.java b/src/java/org/onelab/filter/RetouchedBloomFilter.java deleted file mode 100644 index 6a66943b3e4..00000000000 --- a/src/java/org/onelab/filter/RetouchedBloomFilter.java +++ /dev/null @@ -1,448 +0,0 @@ -/** - * - * Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org) - * All rights reserved. - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the distribution. - * - Neither the name of the University Catholique de Louvain - UCL - * nor the names of its contributors may be used to endorse or - * promote products derived from this software without specific prior - * written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.onelab.filter; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.Random; - -/** - * Implements a retouched Bloom filter, as defined in the CoNEXT 2006 paper. - *

- * It allows the removal of selected false positives at the cost of introducing - * random false negatives, and with the benefit of eliminating some random false - * positives at the same time. - * - * contract European Commission One-Lab Project 034819. - * - * @version 1.0 - 7 Feb. 07 - * - * @see org.onelab.filter.Filter The general behavior of a filter - * @see org.onelab.filter.BloomFilter A Bloom filter - * @see org.onelab.filter.RemoveScheme The different selective clearing algorithms - * - * @see Retouched Bloom Filters: Allowing Networked Applications to Trade Off Selected False Positives Against False Negatives - */ -public final class RetouchedBloomFilter extends BloomFilter -implements RemoveScheme { - /** - * KeyList vector (or ElementList Vector, as defined in the paper) of false positives. - */ - List[] fpVector; - - /** - * KeyList vector of keys recorded in the filter. - */ - List[] keyVector; - - /** - * Ratio vector. - */ - double[] ratio; - - private Random rand; - - /** Default constructor - use with readFields */ - public RetouchedBloomFilter() {} - - /** - * Constructor - * @param vectorSize The vector size of this filter. - * @param nbHash The number of hash function to consider. - * @param hashType type of the hashing function (see {@link Hash}). - */ - public RetouchedBloomFilter(int vectorSize, int nbHash, int hashType) { - super(vectorSize, nbHash, hashType); - - this.rand = null; - createVector(); - }//end constructor - - @Override - public void add(Key key){ - if(key == null) { - throw new NullPointerException("key can not be null"); - } - - int[] h = hash.hash(key); - hash.clear(); - - for(int i = 0; i < nbHash; i++) { - bits.set(h[i]); - keyVector[h[i]].add(key); - }//end for - i - }//end add() - - /** - * Adds a false positive information to this retouched Bloom filter. - *

- * Invariant: if the false positive is null, nothing happens. - * @param key The false positive key to add. - */ - public void addFalsePositive(Key key){ - if(key == null) { - throw new NullPointerException("key can not be null"); - } - - int[] h = hash.hash(key); - hash.clear(); - - for(int i = 0; i < nbHash; i++) { - fpVector[h[i]].add(key); - } - }//end addFalsePositive() - - /** - * Adds a collection of false positive information to this retouched Bloom filter. - * @param coll The collection of false positive. - */ - public void addFalsePositive(Collection coll) { - if(coll == null) { - throw new NullPointerException("Collection can not be null"); - } - - for(Key k: coll) { - addFalsePositive(k); - } - }//end addFalsePositive() - - /** - * Adds a list of false positive information to this retouched Bloom filter. - * @param keys The list of false positive. - */ - public void addFalsePositive(List keys){ - if(keys == null) { - throw new NullPointerException("ArrayList can not be null"); - } - - for(Key k: keys) { - addFalsePositive(k); - } - }//end addFalsePositive() - - /** - * Adds an array of false positive information to this retouched Bloom filter. - * @param keys The array of false positive. - */ - public void addFalsePositive(Key[] keys){ - if(keys == null) { - throw new NullPointerException("Key[] can not be null"); - } - - for(int i = 0; i < keys.length; i++) { - addFalsePositive(keys[i]); - } - }//end addFalsePositive() - - /** - * Performs the selective clearing for a given key. - * @param k The false positive key to remove from this retouched Bloom filter. - * @param scheme The selective clearing scheme to apply. - */ - public void selectiveClearing(Key k, short scheme) { - if(k == null) { - throw new NullPointerException("Key can not be null"); - } - - if(!membershipTest(k)) { - throw new IllegalArgumentException("Key is not a member"); - } - - int index = 0; - int[] h = hash.hash(k); - - switch(scheme) { - - case RANDOM: - index = randomRemove(); - break; - - case MINIMUM_FN: - index = minimumFnRemove(h); - break; - - case MAXIMUM_FP: - index = maximumFpRemove(h); - break; - - case RATIO: - index = ratioRemove(h); - break; - - default: - throw new AssertionError("Undefined selective clearing scheme"); - - }//end switch - - clearBit(index); - }//end selectiveClearing() - - private int randomRemove() { - if(rand == null) { - rand = new Random(); - } - - return rand.nextInt(nbHash); - }//end randomRemove() - - /** - * Chooses the bit position that minimizes the number of false negative generated. - * @param h The different bit positions. - * @return int The position that minimizes the number of false negative generated. - */ - private int minimumFnRemove(int[] h) { - int minIndex = Integer.MAX_VALUE; - double minValue = Double.MAX_VALUE; - - for(int i = 0; i < nbHash; i++) { - double keyWeight = getWeight(keyVector[h[i]]); - - if(keyWeight < minValue) { - minIndex = h[i]; - minValue = keyWeight; - } - - }//end for - i - - return minIndex; - }//end minimumFnRemove() - - /** - * Chooses the bit position that maximizes the number of false positive removed. - * @param h The different bit positions. - * @return int The position that maximizes the number of false positive removed. - */ - private int maximumFpRemove(int[] h){ - int maxIndex = Integer.MIN_VALUE; - double maxValue = Double.MIN_VALUE; - - for(int i = 0; i < nbHash; i++) { - double fpWeight = getWeight(fpVector[h[i]]); - - if(fpWeight > maxValue) { - maxValue = fpWeight; - maxIndex = h[i]; - } - } - - return maxIndex; - }//end maximumFpRemove() - - /** - * Chooses the bit position that minimizes the number of false negative generated while maximizing. - * the number of false positive removed. - * @param h The different bit positions. - * @return int The position that minimizes the number of false negative generated while maximizing. - */ - private int ratioRemove(int[] h){ - computeRatio(); - int minIndex = Integer.MAX_VALUE; - double minValue = Double.MAX_VALUE; - - for(int i = 0; i < nbHash; i++) { - if(ratio[h[i]] < minValue) { - minValue = ratio[h[i]]; - minIndex = h[i]; - } - }//end for - i - - return minIndex; - }//end ratioRemove() - - /** - * Clears a specified bit in the bit vector and keeps up-to-date the KeyList vectors. - * @param index The position of the bit to clear. - */ - private void clearBit(int index){ - if(index < 0 || index >= vectorSize) { - throw new ArrayIndexOutOfBoundsException(index); - } - - List kl = keyVector[index]; - List fpl = fpVector[index]; - - // update key list - int listSize = kl.size(); - for(int i = 0; i < listSize && !kl.isEmpty(); i++) { - removeKey(kl.get(0), keyVector); - } - - kl.clear(); - keyVector[index].clear(); - - //update false positive list - listSize = fpl.size(); - for(int i = 0; i < listSize && !fpl.isEmpty(); i++) { - removeKey(fpl.get(0), fpVector); - } - - fpl.clear(); - fpVector[index].clear(); - - //update ratio - ratio[index] = 0.0; - - //update bit vector - bits.clear(index); - }//end clearBit() - - /** - * Removes a given key from this filer. - * @param k The key to remove. - * @param vector The counting vector associated to the key. - */ - private void removeKey(Key k, List[] vector) { - if(k == null) { - throw new NullPointerException("Key can not be null"); - } - if(vector == null) { - throw new NullPointerException("ArrayList[] can not be null"); - } - - int[] h = hash.hash(k); - hash.clear(); - - for(int i = 0; i < nbHash; i++) { - vector[h[i]].remove(k); - } - }//end removeKey() - - /** - * Computes the ratio A/FP. - */ - private void computeRatio() { - for(int i = 0; i < vectorSize; i++) { - double keyWeight = getWeight(keyVector[i]); - double fpWeight = getWeight(fpVector[i]); - - if(keyWeight > 0 && fpWeight > 0) { - ratio[i] = keyWeight/fpWeight; - } - }//end for - i - }//end computeRatio() - - private double getWeight(List keyList) { - double weight = 0.0; - for(Key k: keyList) { - weight += k.getWeight(); - } - return weight; - } - - /** - * Creates and initialises the various vectors. - */ - @SuppressWarnings("unchecked") - private void createVector() { - fpVector = new List[vectorSize]; - keyVector = new List[vectorSize]; - ratio = new double[vectorSize]; - - for(int i = 0; i < vectorSize; i++) { - fpVector[i] = Collections.synchronizedList(new ArrayList()); - keyVector[i] = Collections.synchronizedList(new ArrayList()); - ratio[i] = 0.0; - }//end for -i - }//end createVector() - - // Writable - - @Override - public void write(DataOutput out) throws IOException { - super.write(out); - for(int i = 0; i < fpVector.length; i++) { - List list = fpVector[i]; - out.writeInt(list.size()); - for(Key k: list) { - k.write(out); - } - } - for(int i = 0; i < keyVector.length; i++) { - List list = keyVector[i]; - out.writeInt(list.size()); - for(Key k: list) { - k.write(out); - } - } - for(int i = 0; i < ratio.length; i++) { - out.writeDouble(ratio[i]); - } - } - - @Override - public void readFields(DataInput in) throws IOException { - super.readFields(in); - createVector(); - for(int i = 0; i < fpVector.length; i++) { - List list = fpVector[i]; - int size = in.readInt(); - for(int j = 0; j < size; j++) { - Key k = new Key(); - k.readFields(in); - list.add(k); - } - } - for(int i = 0; i < keyVector.length; i++) { - List list = keyVector[i]; - int size = in.readInt(); - for(int j = 0; j < size; j++) { - Key k = new Key(); - k.readFields(in); - list.add(k); - } - } - for(int i = 0; i < ratio.length; i++) { - ratio[i] = in.readDouble(); - } - } -}//end class diff --git a/src/test/org/onelab/test/StringKey.java b/src/test/org/onelab/test/StringKey.java deleted file mode 100644 index 256e674e913..00000000000 --- a/src/test/org/onelab/test/StringKey.java +++ /dev/null @@ -1,94 +0,0 @@ -/** - * Copyright (c) 2005, European Commission project OneLab under contract 034819 - * (http://www.one-lab.org) - * - * All rights reserved. - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the distribution. - * - Neither the name of the University Catholique de Louvain - UCL - * nor the names of its contributors may be used to endorse or - * promote products derived from this software without specific prior - * written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.onelab.test; - -import java.io.UnsupportedEncodingException; -import org.apache.hadoop.hbase.HConstants; -import org.onelab.filter.Key; - -/** - * Test class for keys. - *

- * It gives an example on how to extend Key. - * - * contract European Commission One-Lab Project 034819. - * - * @version 1.0 - 5 Feb. 07 - * - * @see org.onelab.filter.Key A key stored in a filter - */ -public class StringKey extends Key { - - /** Default constructor - use with readFields */ - public StringKey() {} - - /** - * Construct a Key using the specified String and default weight - * - * @param key String key value - * @throws UnsupportedEncodingException - */ - public StringKey(String key) throws UnsupportedEncodingException { - super(key.getBytes(HConstants.UTF8_ENCODING)); - } - - /** - * Construct a Key using the specified string and weight - * - * @param key - String key value - * @param weight key weight - * @throws UnsupportedEncodingException - */ - public StringKey(String key, double weight) - throws UnsupportedEncodingException { - - super(key.getBytes(HConstants.UTF8_ENCODING), weight); - } - -} diff --git a/src/test/org/onelab/test/TestFilter.java b/src/test/org/onelab/test/TestFilter.java deleted file mode 100644 index eb04d1afd84..00000000000 --- a/src/test/org/onelab/test/TestFilter.java +++ /dev/null @@ -1,323 +0,0 @@ -/** - * Copyright (c) 2005, European Commission project OneLab under contract 034819 - * (http://www.one-lab.org) - * - * All rights reserved. - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the distribution. - * - Neither the name of the University Catholique de Louvain - UCL - * nor the names of its contributors may be used to endorse or - * promote products derived from this software without specific prior - * written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.onelab.test; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.IOException; -import java.io.UnsupportedEncodingException; -import junit.framework.TestCase; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.hbase.util.Hash; -import org.onelab.filter.*; - -/** - * Test class. - * - * contract European Commission One-Lab Project 034819. - * - * @version 1.0 - 8 Feb. 07 - */ -public class TestFilter extends TestCase { - private static final Log LOG = LogFactory.getLog(TestFilter.class); - - /** Test a BloomFilter - * @throws UnsupportedEncodingException - * @throws IOException - */ - public void testBloomFilter() throws UnsupportedEncodingException, - IOException { - final StringKey[] inserted = { - new StringKey("wmjwjzyv"), - new StringKey("baietibz"), - new StringKey("guhsgxnv"), - new StringKey("mhnqycto"), - new StringKey("xcyqafgz"), - new StringKey("zidoamgb"), - new StringKey("tftfirzd"), - new StringKey("okapqlrg"), - new StringKey("yccwzwsq"), - new StringKey("qmonufqu"), - new StringKey("wlsctews"), - new StringKey("mksdhqri"), - new StringKey("wxxllokj"), - new StringKey("eviuqpls"), - new StringKey("bavotqmj"), - new StringKey("yibqzhdl"), - new StringKey("csfqmsyr"), - new StringKey("guxliyuh"), - new StringKey("pzicietj"), - new StringKey("qdwgrqwo"), - new StringKey("ujfzecmi"), - new StringKey("dzeqfvfi"), - new StringKey("phoegsij"), - new StringKey("bvudfcou"), - new StringKey("dowzmciz"), - new StringKey("etvhkizp"), - new StringKey("rzurqycg"), - new StringKey("krqfxuge"), - new StringKey("gflcohtd"), - new StringKey("fcrcxtps"), - new StringKey("qrtovxdq"), - new StringKey("aypxwrwi"), - new StringKey("dckpyznr"), - new StringKey("mdaawnpz"), - new StringKey("pakdfvca"), - new StringKey("xjglfbez"), - new StringKey("xdsecofi"), - new StringKey("sjlrfcab"), - new StringKey("ebcjawxv"), - new StringKey("hkafkjmy"), - new StringKey("oimmwaxo"), - new StringKey("qcuzrazo"), - new StringKey("nqydfkwk"), - new StringKey("frybvmlb"), - new StringKey("amxmaqws"), - new StringKey("gtkovkgx"), - new StringKey("vgwxrwss"), - new StringKey("xrhzmcep"), - new StringKey("tafwziil"), - new StringKey("erjmncnv"), - new StringKey("heyzqzrn"), - new StringKey("sowvyhtu"), - new StringKey("heeixgzy"), - new StringKey("ktcahcob"), - new StringKey("ljhbybgg"), - new StringKey("jiqfcksl"), - new StringKey("anjdkjhm"), - new StringKey("uzcgcuxp"), - new StringKey("vzdhjqla"), - new StringKey("svhgwwzq"), - new StringKey("zhswvhbp"), - new StringKey("ueceybwy"), - new StringKey("czkqykcw"), - new StringKey("ctisayir"), - new StringKey("hppbgciu"), - new StringKey("nhzgljfk"), - new StringKey("vaziqllf"), - new StringKey("narvrrij"), - new StringKey("kcevbbqi"), - new StringKey("qymuaqnp"), - new StringKey("pwqpfhsr"), - new StringKey("peyeicuk"), - new StringKey("kudlwihi"), - new StringKey("pkmqejlm"), - new StringKey("ylwzjftl"), - new StringKey("rhqrlqar"), - new StringKey("xmftvzsp"), - new StringKey("iaemtihk"), - new StringKey("ymsbrqcu"), - new StringKey("yfnlcxto"), - new StringKey("nluqopqh"), - new StringKey("wmrzhtox"), - new StringKey("qnffhqbl"), - new StringKey("zypqpnbw"), - new StringKey("oiokhatd"), - new StringKey("mdraddiu"), - new StringKey("zqoatltt"), - new StringKey("ewhulbtm"), - new StringKey("nmswpsdf"), - new StringKey("xsjeteqe"), - new StringKey("ufubcbma"), - new StringKey("phyxvrds"), - new StringKey("vhnfldap"), - new StringKey("zrrlycmg"), - new StringKey("becotcjx"), - new StringKey("wvbubokn"), - new StringKey("avkgiopr"), - new StringKey("mbqqxmrv"), - new StringKey("ibplgvuu"), - new StringKey("dghvpkgc") - }; - - final StringKey[] notInserted = { - new StringKey("abcdefgh"), - new StringKey("ijklmnop"), - new StringKey("qrstuvwx"), - new StringKey("yzabcdef") - }; - - /* - * Bloom filters are very sensitive to the number of elements inserted into - * them. - * - * If m denotes the number of bits in the Bloom filter (vectorSize), - * n denotes the number of elements inserted into the Bloom filter and - * k represents the number of hash functions used (nbHash), then - * according to Broder and Mitzenmacher, - * - * ( http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/BloomFilterSurvey.pdf ) - * - * the probability of false positives is minimized when k is - * approximately ln(2) * m/n. - * - * If we fix the number of hash functions and know the number of entries, - * then the optimal vector size m = (k * n) / ln(2) - */ - final int DEFAULT_NUMBER_OF_HASH_FUNCTIONS = 4; - BloomFilter bf = new BloomFilter( - (int) Math.ceil( - (DEFAULT_NUMBER_OF_HASH_FUNCTIONS * (1.0 * inserted.length)) / - Math.log(2.0)), - DEFAULT_NUMBER_OF_HASH_FUNCTIONS, - Hash.JENKINS_HASH - ); - - for (int i = 0; i < inserted.length; i++) { - bf.add(inserted[i]); - } - - // Verify that there are no false negatives and few (if any) false positives - - checkFalsePositivesNegatives(bf, inserted, notInserted); - - // Test serialization/deserialization - - LOG.info("Checking serialization/deserialization"); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - DataOutputStream out = new DataOutputStream(baos); - bf.write(out); - ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray()); - DataInputStream in = new DataInputStream(bais); - bf = new BloomFilter(); - bf.readFields(in); - - // Verify that there are no false negatives and few (if any) false positives - - checkFalsePositivesNegatives(bf, inserted, notInserted); - } - - private void checkFalsePositivesNegatives(BloomFilter bf, - StringKey[] inserted, StringKey[] notInserted) { - // Test membership for values we inserted. Should not get false negatives - - LOG.info("Checking for false negatives"); - for (int i = 0; i < inserted.length; i++) { - if (!bf.membershipTest(inserted[i])) { - LOG.error("false negative for: " + inserted[i]); - fail(); - } - } - - // Test membership for values we did not insert. It is possible to get - // false positives - - LOG.info("Checking for false positives"); - for (int i = 0; i < notInserted.length; i++) { - if(bf.membershipTest(notInserted[i])) { - LOG.error("false positive for: " + notInserted[i]); - fail(); - } - } - LOG.info("Success!"); - } - - /** Test a CountingBloomFilter - * @throws UnsupportedEncodingException - */ - public void testCountingBloomFilter() throws UnsupportedEncodingException { - Filter bf = new CountingBloomFilter(128, 2, Hash.JENKINS_HASH); - Key key = new StringKey("toto"); - Key k2 = new StringKey("lulu"); - Key k3 = new StringKey("mama"); - bf.add(key); - bf.add(k2); - bf.add(k3); - assertTrue(bf.membershipTest(key)); - assertFalse(bf.membershipTest(new StringKey("xyzzy"))); - assertFalse(bf.membershipTest(new StringKey("abcd"))); - - // delete 'key', and check that it is no longer a member - ((CountingBloomFilter)bf).delete(key); - assertFalse(bf.membershipTest(key)); - - // to test for overflows, add 'key' enough times to overflow a 4bit bucket, - // while asserting that it stays a member - for(int i = 0; i < 16; i++){ - bf.add(key); - assertTrue(bf.membershipTest(key)); - } - // test approximateCount - CountingBloomFilter bf3 = new CountingBloomFilter(4, 2, Hash.JENKINS_HASH); - // test the exact range - for (int i = 0; i < 8; i++) { - bf3.add(key); - bf3.add(k2); - assertEquals(bf3.approximateCount(key), i + 1); - assertEquals(bf3.approximateCount(k2), i + 1); - } - // test gently degraded counting in high-fill, high error rate filter - for (int i = 8; i < 15; i++) { - bf3.add(key); - assertTrue(bf3.approximateCount(key) >= (i + 1)); - assertEquals(bf3.approximateCount(k2), 8); - assertEquals(bf3.approximateCount(k3), 0); - } - } - - /** Test a DynamicBloomFilter - * @throws UnsupportedEncodingException - */ - public void testDynamicBloomFilter() throws UnsupportedEncodingException { - Filter bf = new DynamicBloomFilter(128, 2, Hash.JENKINS_HASH, 2); - Key key = new StringKey("toto"); - Key k2 = new StringKey("lulu"); - Key k3 = new StringKey("mama"); - bf.add(key); - bf.add(k2); - bf.add(k3); - assertTrue(bf.membershipTest(key)); - assertFalse(bf.membershipTest(new StringKey("xyzzy"))); - assertFalse(bf.membershipTest(new StringKey("abcd"))); - } -}//end class