From 0c8412cb79a7269ed34100760431cafcaee5f9db Mon Sep 17 00:00:00 2001 From: Jim Kellerman Date: Thu, 1 Nov 2007 01:48:46 +0000 Subject: [PATCH] HADOOP-2126 Use Bob Jenkins' hash for bloom filters git-svn-id: https://svn.apache.org/repos/asf/lucene/hadoop/trunk/src/contrib/hbase@590875 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 1 + .../org/apache/hadoop/hbase/HRegionInfo.java | 30 +-- .../apache/hadoop/hbase/util/JenkinsHash.java | 234 ++++++++++++++++++ src/java/org/onelab/filter/HashFunction.java | 29 +-- src/test/org/onelab/test/TestFilter.java | 10 +- 5 files changed, 248 insertions(+), 56 deletions(-) create mode 100644 src/java/org/apache/hadoop/hbase/util/JenkinsHash.java diff --git a/CHANGES.txt b/CHANGES.txt index 8fa9889dfcc..cdb0310bb13 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -22,6 +22,7 @@ Trunk (unreleased changes) HADOOP-2401 Add convenience put method that takes writable (Johan Oskarsson via Stack) HADOOP-2088 Make hbase runnable in $HADOOP_HOME/build(/contrib/hbase) + HADOOP-2126 Use Bob Jenkins' hash for bloom filters Branch 0.15 (unreleased changes) diff --git a/src/java/org/apache/hadoop/hbase/HRegionInfo.java b/src/java/org/apache/hadoop/hbase/HRegionInfo.java index 282e3c1f70d..2a33a00f282 100644 --- a/src/java/org/apache/hadoop/hbase/HRegionInfo.java +++ b/src/java/org/apache/hadoop/hbase/HRegionInfo.java @@ -23,46 +23,24 @@ import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; -import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; - import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.hbase.util.JenkinsHash; + /** * HRegion information. * Contains HRegion id, start and end keys, a reference to this * HRegions' table descriptor, etc. */ public class HRegionInfo implements WritableComparable { - private static MessageDigest encoder = null; - - static { - try { - if (encoder == null) { - encoder = MessageDigest.getInstance("SHA"); - } - } catch (NoSuchAlgorithmException e) { - e.printStackTrace(); - } - } - /** * @param regionName * @return the encodedName */ public static String encodeRegionName(final Text regionName) { - byte[] bytes = null; - synchronized (encoder) { - encoder.update(regionName.getBytes(), 0, regionName.getLength()); - bytes = encoder.digest(); - encoder.reset(); - } - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < bytes.length; i++) { - sb.append(bytes[i]); - } - return sb.toString(); + return String.valueOf( + JenkinsHash.hash(regionName.getBytes(), regionName.getLength(), 0)); } /** delimiter used between portions of a region name */ diff --git a/src/java/org/apache/hadoop/hbase/util/JenkinsHash.java b/src/java/org/apache/hadoop/hbase/util/JenkinsHash.java new file mode 100644 index 00000000000..29ac73ff7ed --- /dev/null +++ b/src/java/org/apache/hadoop/hbase/util/JenkinsHash.java @@ -0,0 +1,234 @@ +/** + * Copyright 2007 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hbase.util; + +/** + * lookup3.c, by Bob Jenkins, May 2006, Public Domain. + * lookup3.c + * + * You can use this free for any purpose. It's in the public domain. + * It has no warranty. + * + * Produces 32-bit hash for hash table lookup. + */ +public class JenkinsHash { + private static long INT_MASK = 0x00000000ffffffffL; + private static long BYTE_MASK = 0x00000000000000ffL; + + private static long rot(long val, int pos) { + return Long.valueOf(Integer.rotateLeft( + Long.valueOf(val & INT_MASK).intValue(), pos)).longValue() & INT_MASK; + } + + /** + * Alternate form for hashing an entire byte array + * + * @param bytes + * @param initval + * @return hash value + */ + public static int hash(byte[] bytes, int initval) { + return hash(bytes, bytes.length, initval); + } + + /** + * taken from hashlittle() -- hash a variable-length key into a 32-bit value + * + * @param key the key (the unaligned variable-length array of bytes) + * @param nbytes number of bytes to include in hash + * @param initval can be any integer value + * @return a 32-bit value. Every bit of the key affects every bit of the + * return value. Two keys differing by one or two bits will have totally + * different hash values. + * + * The best hash table sizes are powers of 2. There is no need to do mod a + * prime (mod is sooo slow!). If you need less than 32 bits, use a bitmask. + * For example, if you need only 10 bits, do h = (h & hashmask(10)); + * In which case, the hash table should have hashsize(10) elements. + * + * If you are hashing n strings byte[][] k, do it like this: + * for (int i = 0, h = 0; i < n; ++i) h = hash( k[i], h); + * + * By Bob Jenkins, 2006. bob_jenkins@burtleburtle.net. You may use this + * code any way you wish, private, educational, or commercial. It's free. + * + * Use for hash table lookup, or anything where one collision in 2^^32 is + * acceptable. Do NOT use for cryptographic purposes. + */ + public static int hash(byte[] key, int nbytes, int initval) { + int length = nbytes; + long a, b, c; // We use longs because we don't have unsigned ints + a = b = c = (0x00000000deadbeefL + length + initval) & INT_MASK; + int offset = 0; + for (; length > 12; offset += 12, length -= 12) { + a = (a + (key[offset + 0] & BYTE_MASK)) & INT_MASK; + a = (a + (((key[offset + 1] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK; + a = (a + (((key[offset + 2] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK; + a = (a + (((key[offset + 3] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK; + b = (b + (key[offset + 4] & BYTE_MASK)) & INT_MASK; + b = (b + (((key[offset + 5] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK; + b = (b + (((key[offset + 6] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK; + b = (b + (((key[offset + 7] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK; + c = (c + (key[offset + 8] & BYTE_MASK)) & INT_MASK; + c = (c + (((key[offset + 9] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK; + c = (c + (((key[offset + 10] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK; + c = (c + (((key[offset + 11] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK; + + /* + * mix -- mix 3 32-bit values reversibly. + * This is reversible, so any information in (a,b,c) before mix() is + * still in (a,b,c) after mix(). + * + * If four pairs of (a,b,c) inputs are run through mix(), or through + * mix() in reverse, there are at least 32 bits of the output that + * are sometimes the same for one pair and different for another pair. + * + * This was tested for: + * - pairs that differed by one bit, by two bits, in any combination + * of top bits of (a,b,c), or in any combination of bottom bits of + * (a,b,c). + * - "differ" is defined as +, -, ^, or ~^. For + and -, I transformed + * the output delta to a Gray code (a^(a>>1)) so a string of 1's (as + * is commonly produced by subtraction) look like a single 1-bit + * difference. + * - the base values were pseudorandom, all zero but one bit set, or + * all zero plus a counter that starts at zero. + * + * Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that + * satisfy this are + * 4 6 8 16 19 4 + * 9 15 3 18 27 15 + * 14 9 3 7 17 3 + * Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing for + * "differ" defined as + with a one-bit base and a two-bit delta. I + * used http://burtleburtle.net/bob/hash/avalanche.html to choose + * the operations, constants, and arrangements of the variables. + * + * This does not achieve avalanche. There are input bits of (a,b,c) + * that fail to affect some output bits of (a,b,c), especially of a. + * The most thoroughly mixed value is c, but it doesn't really even + * achieve avalanche in c. + * + * This allows some parallelism. Read-after-writes are good at doubling + * the number of bits affected, so the goal of mixing pulls in the + * opposite direction as the goal of parallelism. I did what I could. + * Rotates seem to cost as much as shifts on every machine I could lay + * my hands on, and rotates are much kinder to the top and bottom bits, + * so I used rotates. + * + * #define mix(a,b,c) \ + * { \ + * a -= c; a ^= rot(c, 4); c += b; \ + * b -= a; b ^= rot(a, 6); a += c; \ + * c -= b; c ^= rot(b, 8); b += a; \ + * a -= c; a ^= rot(c,16); c += b; \ + * b -= a; b ^= rot(a,19); a += c; \ + * c -= b; c ^= rot(b, 4); b += a; \ + * } + * + * mix(a,b,c); + */ + a = (a - c) & INT_MASK; a ^= rot(c, 4); c = (c + b) & INT_MASK; + b = (b - a) & INT_MASK; b ^= rot(a, 6); a = (a + c) & INT_MASK; + c = (c - b) & INT_MASK; c ^= rot(b, 8); b = (b + a) & INT_MASK; + a = (a - c) & INT_MASK; a ^= rot(c,16); c = (c + b) & INT_MASK; + b = (b - a) & INT_MASK; b ^= rot(a,19); a = (a + c) & INT_MASK; + c = (c - b) & INT_MASK; c ^= rot(b, 4); b = (b + a) & INT_MASK; + } + + //-------------------------------- last block: affect all 32 bits of (c) + switch (length) { // all the case statements fall through + case 12: + c = (c + (((key[offset + 11] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK; + case 11: + c = (c + (((key[offset + 10] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK; + case 10: + c = (c + (((key[offset + 9] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK; + case 9: + c = (c + (key[offset + 8] & BYTE_MASK)) & INT_MASK; + case 8: + b = (b + (((key[offset + 7] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK; + case 7: + b = (b + (((key[offset + 6] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK; + case 6: + b = (b + (((key[offset + 5] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK; + case 5: + b = (b + (key[offset + 4] & BYTE_MASK)) & INT_MASK; + case 4: + a = (a + (((key[offset + 3] & BYTE_MASK) << 24) & INT_MASK)) & INT_MASK; + case 3: + a = (a + (((key[offset + 2] & BYTE_MASK) << 16) & INT_MASK)) & INT_MASK; + case 2: + a = (a + (((key[offset + 1] & BYTE_MASK) << 8) & INT_MASK)) & INT_MASK; + case 1: + a = (a + (key[offset + 0] & BYTE_MASK)) & INT_MASK; + break; + case 0: + return Long.valueOf(c & INT_MASK).intValue(); + } + /* + * final -- final mixing of 3 32-bit values (a,b,c) into c + * + * Pairs of (a,b,c) values differing in only a few bits will usually + * produce values of c that look totally different. This was tested for + * - pairs that differed by one bit, by two bits, in any combination + * of top bits of (a,b,c), or in any combination of bottom bits of + * (a,b,c). + * + * - "differ" is defined as +, -, ^, or ~^. For + and -, I transformed + * the output delta to a Gray code (a^(a>>1)) so a string of 1's (as + * is commonly produced by subtraction) look like a single 1-bit + * difference. + * + * - the base values were pseudorandom, all zero but one bit set, or + * all zero plus a counter that starts at zero. + * + * These constants passed: + * 14 11 25 16 4 14 24 + * 12 14 25 16 4 14 24 + * and these came close: + * 4 8 15 26 3 22 24 + * 10 8 15 26 3 22 24 + * 11 8 15 26 3 22 24 + * + * #define final(a,b,c) \ + * { + * c ^= b; c -= rot(b,14); \ + * a ^= c; a -= rot(c,11); \ + * b ^= a; b -= rot(a,25); \ + * c ^= b; c -= rot(b,16); \ + * a ^= c; a -= rot(c,4); \ + * b ^= a; b -= rot(a,14); \ + * c ^= b; c -= rot(b,24); \ + * } + * + */ + c ^= b; c = (c - rot(b,14)) & INT_MASK; + a ^= c; a = (a - rot(c,11)) & INT_MASK; + b ^= a; b = (b - rot(a,25)) & INT_MASK; + c ^= b; c = (c - rot(b,16)) & INT_MASK; + a ^= c; a = (a - rot(c,4)) & INT_MASK; + b ^= a; b = (b - rot(a,14)) & INT_MASK; + c ^= b; c = (c - rot(b,24)) & INT_MASK; + + return Long.valueOf(c & INT_MASK).intValue(); + } +} diff --git a/src/java/org/onelab/filter/HashFunction.java b/src/java/org/onelab/filter/HashFunction.java index 12d4dc4256c..a89f9061ab3 100644 --- a/src/java/org/onelab/filter/HashFunction.java +++ b/src/java/org/onelab/filter/HashFunction.java @@ -49,7 +49,7 @@ */ package org.onelab.filter; -import java.security.*; +import org.apache.hadoop.hbase.util.JenkinsHash; /** * Implements a hash object that returns a certain number of hashed values. @@ -66,9 +66,6 @@ import java.security.*; * @see SHA-1 algorithm */ public final class HashFunction{ - /** The SHA-1 algorithm. */ - private MessageDigest sha; - /** The number of hashed values. */ private int nbHash; @@ -83,13 +80,6 @@ public final class HashFunction{ * @param nbHash The number of resulting hashed values. */ public HashFunction(int maxValue, int nbHash) { - try { - sha = MessageDigest.getInstance("SHA-1"); - - } catch(NoSuchAlgorithmException e) { - throw new AssertionError(e); - } - if(maxValue <= 0) { throw new IllegalArgumentException("maxValue must be > 0"); } @@ -102,9 +92,8 @@ public final class HashFunction{ this.nbHash = nbHash; }//end constructor - /** Clears this hash function. */ + /** Clears this hash function. A NOOP */ public void clear(){ - sha.reset(); }//end clear() /** @@ -121,19 +110,9 @@ public final class HashFunction{ if(b.length == 0) { throw new IllegalArgumentException("key length must be > 0"); } - sha.update(b); - byte[] digestBytes = sha.digest(); int[] result = new int[nbHash]; - int nbBytePerInt = digestBytes.length/nbHash; - int offset = 0; - for(int i = 0; i < nbHash; i++){ - int val = 0; - for(int j = offset; j < offset + nbBytePerInt; j++) { - val |= - (digestBytes[offset] & 0xff) << ((nbBytePerInt - 1 - (j - offset)) * 8); - } - result[i] = Math.abs(val) % maxValue; - offset += nbBytePerInt; + for (int i = 0, initval = 0; i < nbHash; i++) { + result[i] = Math.abs(JenkinsHash.hash(b, initval)) % maxValue; } return result; }//end hash() diff --git a/src/test/org/onelab/test/TestFilter.java b/src/test/org/onelab/test/TestFilter.java index 26980e3af1c..98c43265dd2 100644 --- a/src/test/org/onelab/test/TestFilter.java +++ b/src/test/org/onelab/test/TestFilter.java @@ -75,8 +75,8 @@ public class TestFilter extends TestCase { bf.add(k3); assertTrue(bf.membershipTest(key)); assertFalse(bf.membershipTest(new StringKey("graknyl"))); - assertTrue(bf.membershipTest(new StringKey("xyzzy"))); // False positive - assertTrue(bf.membershipTest(new StringKey("abcd"))); // False positive + assertFalse(bf.membershipTest(new StringKey("xyzzy"))); + assertFalse(bf.membershipTest(new StringKey("abcd"))); } /** Test a CountingBloomFilter @@ -92,8 +92,8 @@ public class TestFilter extends TestCase { bf.add(k3); assertTrue(bf.membershipTest(key)); assertFalse(bf.membershipTest(new StringKey("graknyl"))); - assertTrue(bf.membershipTest(new StringKey("xyzzy"))); // False positive - assertTrue(bf.membershipTest(new StringKey("abcd"))); // False positive + assertFalse(bf.membershipTest(new StringKey("xyzzy"))); + assertFalse(bf.membershipTest(new StringKey("abcd"))); } /** Test a DynamicBloomFilter @@ -110,6 +110,6 @@ public class TestFilter extends TestCase { assertTrue(bf.membershipTest(key)); assertFalse(bf.membershipTest(new StringKey("graknyl"))); assertFalse(bf.membershipTest(new StringKey("xyzzy"))); - assertTrue(bf.membershipTest(new StringKey("abcd"))); // False positive + assertFalse(bf.membershipTest(new StringKey("abcd"))); } }//end class