diff --git a/CHANGES.txt b/CHANGES.txt
index 8fa9889dfcc..cdb0310bb13 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -22,6 +22,7 @@ Trunk (unreleased changes)
HADOOP-2401 Add convenience put method that takes writable
(Johan Oskarsson via Stack)
HADOOP-2088 Make hbase runnable in $HADOOP_HOME/build(/contrib/hbase)
+ HADOOP-2126 Use Bob Jenkins' hash for bloom filters
Branch 0.15 (unreleased changes)
diff --git a/src/java/org/apache/hadoop/hbase/HRegionInfo.java b/src/java/org/apache/hadoop/hbase/HRegionInfo.java
index 282e3c1f70d..2a33a00f282 100644
--- a/src/java/org/apache/hadoop/hbase/HRegionInfo.java
+++ b/src/java/org/apache/hadoop/hbase/HRegionInfo.java
@@ -23,46 +23,24 @@ import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
-import java.security.MessageDigest;
-import java.security.NoSuchAlgorithmException;
-
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.hbase.util.JenkinsHash;
+
/**
* HRegion information.
* Contains HRegion id, start and end keys, a reference to this
* HRegions' table descriptor, etc.
*/
public class HRegionInfo implements WritableComparable {
- private static MessageDigest encoder = null;
-
- static {
- try {
- if (encoder == null) {
- encoder = MessageDigest.getInstance("SHA");
- }
- } catch (NoSuchAlgorithmException e) {
- e.printStackTrace();
- }
- }
-
/**
* @param regionName
* @return the encodedName
*/
public static String encodeRegionName(final Text regionName) {
- byte[] bytes = null;
- synchronized (encoder) {
- encoder.update(regionName.getBytes(), 0, regionName.getLength());
- bytes = encoder.digest();
- encoder.reset();
- }
- StringBuilder sb = new StringBuilder();
- for (int i = 0; i < bytes.length; i++) {
- sb.append(bytes[i]);
- }
- return sb.toString();
+ return String.valueOf(
+ JenkinsHash.hash(regionName.getBytes(), regionName.getLength(), 0));
}
/** delimiter used between portions of a region name */
diff --git a/src/java/org/apache/hadoop/hbase/util/JenkinsHash.java b/src/java/org/apache/hadoop/hbase/util/JenkinsHash.java
new file mode 100644
index 00000000000..29ac73ff7ed
--- /dev/null
+++ b/src/java/org/apache/hadoop/hbase/util/JenkinsHash.java
@@ -0,0 +1,234 @@
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.util;
+
+/**
+ * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
+ * lookup3.c
+ *
+ * You can use this free for any purpose. It's in the public domain.
+ * It has no warranty.
+ *
+ * Produces 32-bit hash for hash table lookup.
+ */
+public class JenkinsHash {
+ private static long INT_MASK = 0x00000000ffffffffL;
+ private static long BYTE_MASK = 0x00000000000000ffL;
+
+ private static long rot(long val, int pos) {
+ return Long.valueOf(Integer.rotateLeft(
+ Long.valueOf(val & INT_MASK).intValue(), pos)).longValue() & INT_MASK;
+ }
+
+ /**
+ * Alternate form for hashing an entire byte array
+ *
+ * @param bytes
+ * @param initval
+ * @return hash value
+ */
+ public static int hash(byte[] bytes, int initval) {
+ return hash(bytes, bytes.length, initval);
+ }
+
+ /**
+ * taken from hashlittle() -- hash a variable-length key into a 32-bit value
+ *
+ * @param key the key (the unaligned variable-length array of bytes)
+ * @param nbytes number of bytes to include in hash
+ * @param initval can be any integer value
+ * @return a 32-bit value. Every bit of the key affects every bit of the
+ * return value. Two keys differing by one or two bits will have totally
+ * different hash values.
+ *
+ * The best hash table sizes are powers of 2. There is no need to do mod a
+ * prime (mod is sooo slow!). If you need less than 32 bits, use a bitmask.
+ * For example, if you need only 10 bits, do h = (h & hashmask(10));
+ * In which case, the hash table should have hashsize(10) elements.
+ *
+ * If you are hashing n strings byte[][] k, do it like this:
+ * for (int i = 0, h = 0; i < n; ++i) h = hash( k[i], h);
+ *
+ * By Bob Jenkins, 2006. bob_jenkins@burtleburtle.net. You may use this
+ * code any way you wish, private, educational, or commercial. It's free.
+ *
+ * Use for hash table lookup, or anything where one collision in 2^^32 is
+ * acceptable. Do NOT use for cryptographic purposes.
+ */
+ public static int hash(byte[] key, int nbytes, int initval) {
+ int length = nbytes;
+ long a, b, c; // We use longs because we don't have unsigned ints
+ a = b = c = (0x00000000deadbeefL + length + initval) & INT_MASK;
+ int offset = 0;
+ for (; length > 12; offset += 12, length -= 12) {
+ a = (a + (key[offset + 0] & BYTE_MASK)) & INT_MASK;
+ a = (a + (((key[offset + 1] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK;
+ a = (a + (((key[offset + 2] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK;
+ a = (a + (((key[offset + 3] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK;
+ b = (b + (key[offset + 4] & BYTE_MASK)) & INT_MASK;
+ b = (b + (((key[offset + 5] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK;
+ b = (b + (((key[offset + 6] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK;
+ b = (b + (((key[offset + 7] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK;
+ c = (c + (key[offset + 8] & BYTE_MASK)) & INT_MASK;
+ c = (c + (((key[offset + 9] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK;
+ c = (c + (((key[offset + 10] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK;
+ c = (c + (((key[offset + 11] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK;
+
+ /*
+ * mix -- mix 3 32-bit values reversibly.
+ * This is reversible, so any information in (a,b,c) before mix() is
+ * still in (a,b,c) after mix().
+ *
+ * If four pairs of (a,b,c) inputs are run through mix(), or through
+ * mix() in reverse, there are at least 32 bits of the output that
+ * are sometimes the same for one pair and different for another pair.
+ *
+ * This was tested for:
+ * - pairs that differed by one bit, by two bits, in any combination
+ * of top bits of (a,b,c), or in any combination of bottom bits of
+ * (a,b,c).
+ * - "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
+ * the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+ * is commonly produced by subtraction) look like a single 1-bit
+ * difference.
+ * - the base values were pseudorandom, all zero but one bit set, or
+ * all zero plus a counter that starts at zero.
+ *
+ * Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
+ * satisfy this are
+ * 4 6 8 16 19 4
+ * 9 15 3 18 27 15
+ * 14 9 3 7 17 3
+ * Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing for
+ * "differ" defined as + with a one-bit base and a two-bit delta. I
+ * used http://burtleburtle.net/bob/hash/avalanche.html to choose
+ * the operations, constants, and arrangements of the variables.
+ *
+ * This does not achieve avalanche. There are input bits of (a,b,c)
+ * that fail to affect some output bits of (a,b,c), especially of a.
+ * The most thoroughly mixed value is c, but it doesn't really even
+ * achieve avalanche in c.
+ *
+ * This allows some parallelism. Read-after-writes are good at doubling
+ * the number of bits affected, so the goal of mixing pulls in the
+ * opposite direction as the goal of parallelism. I did what I could.
+ * Rotates seem to cost as much as shifts on every machine I could lay
+ * my hands on, and rotates are much kinder to the top and bottom bits,
+ * so I used rotates.
+ *
+ * #define mix(a,b,c) \
+ * { \
+ * a -= c; a ^= rot(c, 4); c += b; \
+ * b -= a; b ^= rot(a, 6); a += c; \
+ * c -= b; c ^= rot(b, 8); b += a; \
+ * a -= c; a ^= rot(c,16); c += b; \
+ * b -= a; b ^= rot(a,19); a += c; \
+ * c -= b; c ^= rot(b, 4); b += a; \
+ * }
+ *
+ * mix(a,b,c);
+ */
+ a = (a - c) & INT_MASK; a ^= rot(c, 4); c = (c + b) & INT_MASK;
+ b = (b - a) & INT_MASK; b ^= rot(a, 6); a = (a + c) & INT_MASK;
+ c = (c - b) & INT_MASK; c ^= rot(b, 8); b = (b + a) & INT_MASK;
+ a = (a - c) & INT_MASK; a ^= rot(c,16); c = (c + b) & INT_MASK;
+ b = (b - a) & INT_MASK; b ^= rot(a,19); a = (a + c) & INT_MASK;
+ c = (c - b) & INT_MASK; c ^= rot(b, 4); b = (b + a) & INT_MASK;
+ }
+
+ //-------------------------------- last block: affect all 32 bits of (c)
+ switch (length) { // all the case statements fall through
+ case 12:
+ c = (c + (((key[offset + 11] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK;
+ case 11:
+ c = (c + (((key[offset + 10] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK;
+ case 10:
+ c = (c + (((key[offset + 9] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK;
+ case 9:
+ c = (c + (key[offset + 8] & BYTE_MASK)) & INT_MASK;
+ case 8:
+ b = (b + (((key[offset + 7] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK;
+ case 7:
+ b = (b + (((key[offset + 6] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK;
+ case 6:
+ b = (b + (((key[offset + 5] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK;
+ case 5:
+ b = (b + (key[offset + 4] & BYTE_MASK)) & INT_MASK;
+ case 4:
+ a = (a + (((key[offset + 3] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK;
+ case 3:
+ a = (a + (((key[offset + 2] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK;
+ case 2:
+ a = (a + (((key[offset + 1] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK;
+ case 1:
+ a = (a + (key[offset + 0] & BYTE_MASK)) & INT_MASK;
+ break;
+ case 0:
+ return Long.valueOf(c & INT_MASK).intValue();
+ }
+ /*
+ * final -- final mixing of 3 32-bit values (a,b,c) into c
+ *
+ * Pairs of (a,b,c) values differing in only a few bits will usually
+ * produce values of c that look totally different. This was tested for
+ * - pairs that differed by one bit, by two bits, in any combination
+ * of top bits of (a,b,c), or in any combination of bottom bits of
+ * (a,b,c).
+ *
+ * - "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
+ * the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+ * is commonly produced by subtraction) look like a single 1-bit
+ * difference.
+ *
+ * - the base values were pseudorandom, all zero but one bit set, or
+ * all zero plus a counter that starts at zero.
+ *
+ * These constants passed:
+ * 14 11 25 16 4 14 24
+ * 12 14 25 16 4 14 24
+ * and these came close:
+ * 4 8 15 26 3 22 24
+ * 10 8 15 26 3 22 24
+ * 11 8 15 26 3 22 24
+ *
+ * #define final(a,b,c) \
+ * {
+ * c ^= b; c -= rot(b,14); \
+ * a ^= c; a -= rot(c,11); \
+ * b ^= a; b -= rot(a,25); \
+ * c ^= b; c -= rot(b,16); \
+ * a ^= c; a -= rot(c,4); \
+ * b ^= a; b -= rot(a,14); \
+ * c ^= b; c -= rot(b,24); \
+ * }
+ *
+ */
+ c ^= b; c = (c - rot(b,14)) & INT_MASK;
+ a ^= c; a = (a - rot(c,11)) & INT_MASK;
+ b ^= a; b = (b - rot(a,25)) & INT_MASK;
+ c ^= b; c = (c - rot(b,16)) & INT_MASK;
+ a ^= c; a = (a - rot(c,4)) & INT_MASK;
+ b ^= a; b = (b - rot(a,14)) & INT_MASK;
+ c ^= b; c = (c - rot(b,24)) & INT_MASK;
+
+ return Long.valueOf(c & INT_MASK).intValue();
+ }
+}
diff --git a/src/java/org/onelab/filter/HashFunction.java b/src/java/org/onelab/filter/HashFunction.java
index 12d4dc4256c..a89f9061ab3 100644
--- a/src/java/org/onelab/filter/HashFunction.java
+++ b/src/java/org/onelab/filter/HashFunction.java
@@ -49,7 +49,7 @@
*/
package org.onelab.filter;
-import java.security.*;
+import org.apache.hadoop.hbase.util.JenkinsHash;
/**
* Implements a hash object that returns a certain number of hashed values.
@@ -66,9 +66,6 @@ import java.security.*;
* @see SHA-1 algorithm
*/
public final class HashFunction{
- /** The SHA-1 algorithm. */
- private MessageDigest sha;
-
/** The number of hashed values. */
private int nbHash;
@@ -83,13 +80,6 @@ public final class HashFunction{
* @param nbHash The number of resulting hashed values.
*/
public HashFunction(int maxValue, int nbHash) {
- try {
- sha = MessageDigest.getInstance("SHA-1");
-
- } catch(NoSuchAlgorithmException e) {
- throw new AssertionError(e);
- }
-
if(maxValue <= 0) {
throw new IllegalArgumentException("maxValue must be > 0");
}
@@ -102,9 +92,8 @@ public final class HashFunction{
this.nbHash = nbHash;
}//end constructor
- /** Clears this hash function. */
+ /** Clears this hash function. A NOOP */
public void clear(){
- sha.reset();
}//end clear()
/**
@@ -121,19 +110,9 @@ public final class HashFunction{
if(b.length == 0) {
throw new IllegalArgumentException("key length must be > 0");
}
- sha.update(b);
- byte[] digestBytes = sha.digest();
int[] result = new int[nbHash];
- int nbBytePerInt = digestBytes.length/nbHash;
- int offset = 0;
- for(int i = 0; i < nbHash; i++){
- int val = 0;
- for(int j = offset; j < offset + nbBytePerInt; j++) {
- val |=
- (digestBytes[offset] & 0xff) << ((nbBytePerInt - 1 - (j - offset)) * 8);
- }
- result[i] = Math.abs(val) % maxValue;
- offset += nbBytePerInt;
+ for (int i = 0, initval = 0; i < nbHash; i++) {
+ result[i] = Math.abs(JenkinsHash.hash(b, initval)) % maxValue;
}
return result;
}//end hash()
diff --git a/src/test/org/onelab/test/TestFilter.java b/src/test/org/onelab/test/TestFilter.java
index 26980e3af1c..98c43265dd2 100644
--- a/src/test/org/onelab/test/TestFilter.java
+++ b/src/test/org/onelab/test/TestFilter.java
@@ -75,8 +75,8 @@ public class TestFilter extends TestCase {
bf.add(k3);
assertTrue(bf.membershipTest(key));
assertFalse(bf.membershipTest(new StringKey("graknyl")));
- assertTrue(bf.membershipTest(new StringKey("xyzzy"))); // False positive
- assertTrue(bf.membershipTest(new StringKey("abcd"))); // False positive
+ assertFalse(bf.membershipTest(new StringKey("xyzzy")));
+ assertFalse(bf.membershipTest(new StringKey("abcd")));
}
/** Test a CountingBloomFilter
@@ -92,8 +92,8 @@ public class TestFilter extends TestCase {
bf.add(k3);
assertTrue(bf.membershipTest(key));
assertFalse(bf.membershipTest(new StringKey("graknyl")));
- assertTrue(bf.membershipTest(new StringKey("xyzzy"))); // False positive
- assertTrue(bf.membershipTest(new StringKey("abcd"))); // False positive
+ assertFalse(bf.membershipTest(new StringKey("xyzzy")));
+ assertFalse(bf.membershipTest(new StringKey("abcd")));
}
/** Test a DynamicBloomFilter
@@ -110,6 +110,6 @@ public class TestFilter extends TestCase {
assertTrue(bf.membershipTest(key));
assertFalse(bf.membershipTest(new StringKey("graknyl")));
assertFalse(bf.membershipTest(new StringKey("xyzzy")));
- assertTrue(bf.membershipTest(new StringKey("abcd"))); // False positive
+ assertFalse(bf.membershipTest(new StringKey("abcd")));
}
}//end class