move to use murmur3 for bloom filter
parent bd219f3b34
commit cc2a6babda
@@ -0,0 +1,264 @@
/*
 * Licensed to Elastic Search and Shay Banon under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Elastic Search licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.common;

/**
 * This is a very fast, non-cryptographic hash suitable for general hash-based
 * lookup. See http://murmurhash.googlepages.com/ for more details.
 * <p/>
 * hash32() and hash64() are MurmurHash 2.0.
 * hash3_x64_128() is MurmurHash 3.0.
 * <p/>
 * <p>
 * The C version of MurmurHash 2.0 found at that site was ported to Java by
 * Andrzej Bialecki (ab at getopt org).
 * </p>
 */
public class MurmurHash {
    public static int hash32(byte[] data, int offset, int length, int seed) {
        int m = 0x5bd1e995;
        int r = 24;

        int h = seed ^ length;

        int len_4 = length >> 2;

        for (int i = 0; i < len_4; i++) {
            int i_4 = i << 2;
            int k = data[offset + i_4 + 3];
            k = k << 8;
            k = k | (data[offset + i_4 + 2] & 0xff);
            k = k << 8;
            k = k | (data[offset + i_4 + 1] & 0xff);
            k = k << 8;
            k = k | (data[offset + i_4 + 0] & 0xff);
            k *= m;
            k ^= k >>> r;
            k *= m;
            h *= m;
            h ^= k;
        }

        // avoid calculating modulo
        int len_m = len_4 << 2;
        int left = length - len_m;

        if (left != 0) {
            if (left >= 3) {
                h ^= (int) data[offset + length - 3] << 16;
            }
            if (left >= 2) {
                h ^= (int) data[offset + length - 2] << 8;
            }
            if (left >= 1) {
                h ^= (int) data[offset + length - 1];
            }

            h *= m;
        }

        h ^= h >>> 13;
        h *= m;
        h ^= h >>> 15;

        return h;
    }

    public static long hash2_64(byte[] key, int offset, int length, long seed) {
        long m64 = 0xc6a4a7935bd1e995L;
        int r64 = 47;

        long h64 = (seed & 0xffffffffL) ^ (m64 * length);

        int lenLongs = length >> 3;

        for (int i = 0; i < lenLongs; ++i) {
            int i_8 = i << 3;

            long k64 = ((long) key[offset + i_8 + 0] & 0xff) + (((long) key[offset + i_8 + 1] & 0xff) << 8) +
                    (((long) key[offset + i_8 + 2] & 0xff) << 16) + (((long) key[offset + i_8 + 3] & 0xff) << 24) +
                    (((long) key[offset + i_8 + 4] & 0xff) << 32) + (((long) key[offset + i_8 + 5] & 0xff) << 40) +
                    (((long) key[offset + i_8 + 6] & 0xff) << 48) + (((long) key[offset + i_8 + 7] & 0xff) << 56);

            k64 *= m64;
            k64 ^= k64 >>> r64;
            k64 *= m64;

            h64 ^= k64;
            h64 *= m64;
        }

        int rem = length & 0x7;

        switch (rem) {
            case 0:
                break;
            case 7:
                h64 ^= (long) key[offset + length - rem + 6] << 48;
            case 6:
                h64 ^= (long) key[offset + length - rem + 5] << 40;
            case 5:
                h64 ^= (long) key[offset + length - rem + 4] << 32;
            case 4:
                h64 ^= (long) key[offset + length - rem + 3] << 24;
            case 3:
                h64 ^= (long) key[offset + length - rem + 2] << 16;
            case 2:
                h64 ^= (long) key[offset + length - rem + 1] << 8;
            case 1:
                h64 ^= (long) key[offset + length - rem];
                h64 *= m64;
        }

        h64 ^= h64 >>> r64;
        h64 *= m64;
        h64 ^= h64 >>> r64;

        return h64;
    }

    protected static long getblock(byte[] key, int offset, int index) {
        int i_8 = index << 3;
        int blockOffset = offset + i_8;
        return ((long) key[blockOffset + 0] & 0xff) + (((long) key[blockOffset + 1] & 0xff) << 8) +
                (((long) key[blockOffset + 2] & 0xff) << 16) + (((long) key[blockOffset + 3] & 0xff) << 24) +
                (((long) key[blockOffset + 4] & 0xff) << 32) + (((long) key[blockOffset + 5] & 0xff) << 40) +
                (((long) key[blockOffset + 6] & 0xff) << 48) + (((long) key[blockOffset + 7] & 0xff) << 56);
    }

    protected static long rotl64(long v, int n) {
        return ((v << n) | (v >>> (64 - n)));
    }

    protected static long fmix(long k) {
        k ^= k >>> 33;
        k *= 0xff51afd7ed558ccdL;
        k ^= k >>> 33;
        k *= 0xc4ceb9fe1a85ec53L;
        k ^= k >>> 33;

        return k;
    }

    public static long[] hash3_x64_128(byte[] key, int offset, int length, long seed) {
        final int nblocks = length >> 4; // Process as 128-bit blocks.

        long h1 = seed;
        long h2 = seed;

        long c1 = 0x87c37b91114253d5L;
        long c2 = 0x4cf5ad432745937fL;

        //----------
        // body

        for (int i = 0; i < nblocks; i++) {
            long k1 = getblock(key, offset, i * 2 + 0);
            long k2 = getblock(key, offset, i * 2 + 1);

            k1 *= c1;
            k1 = rotl64(k1, 31);
            k1 *= c2;
            h1 ^= k1;

            h1 = rotl64(h1, 27);
            h1 += h2;
            h1 = h1 * 5 + 0x52dce729;

            k2 *= c2;
            k2 = rotl64(k2, 33);
            k2 *= c1;
            h2 ^= k2;

            h2 = rotl64(h2, 31);
            h2 += h1;
            h2 = h2 * 5 + 0x38495ab5;
        }

        //----------
        // tail

        // Advance offset to the unprocessed tail of the data.
        offset += nblocks * 16;

        long k1 = 0;
        long k2 = 0;

        switch (length & 15) {
            case 15:
                k2 ^= ((long) key[offset + 14]) << 48;
            case 14:
                k2 ^= ((long) key[offset + 13]) << 40;
            case 13:
                k2 ^= ((long) key[offset + 12]) << 32;
            case 12:
                k2 ^= ((long) key[offset + 11]) << 24;
            case 11:
                k2 ^= ((long) key[offset + 10]) << 16;
            case 10:
                k2 ^= ((long) key[offset + 9]) << 8;
            case 9:
                k2 ^= ((long) key[offset + 8]) << 0;
                k2 *= c2;
                k2 = rotl64(k2, 33);
                k2 *= c1;
                h2 ^= k2;

            case 8:
                k1 ^= ((long) key[offset + 7]) << 56;
            case 7:
                k1 ^= ((long) key[offset + 6]) << 48;
            case 6:
                k1 ^= ((long) key[offset + 5]) << 40;
            case 5:
                k1 ^= ((long) key[offset + 4]) << 32;
            case 4:
                k1 ^= ((long) key[offset + 3]) << 24;
            case 3:
                k1 ^= ((long) key[offset + 2]) << 16;
            case 2:
                k1 ^= ((long) key[offset + 1]) << 8;
            case 1:
                k1 ^= ((long) key[offset]);
                k1 *= c1;
                k1 = rotl64(k1, 31);
                k1 *= c2;
                h1 ^= k1;
        }

        //----------
        // finalization

        h1 ^= length;
        h2 ^= length;

        h1 += h2;
        h2 += h1;

        h1 = fmix(h1);
        h2 = fmix(h2);

        h1 += h2;
        h2 += h1;

        return (new long[]{h1, h2});
    }
}
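For reference, a minimal usage sketch of the class added above (not part of the commit; the key bytes and seed are illustrative): hash3_x64_128() is the MurmurHash 3 routine and returns the two 64-bit halves of its 128-bit digest as a long[2], while the MurmurHash 2.0 64-bit variant is kept in this class under the name hash2_64().

import java.nio.charset.StandardCharsets;

import org.elasticsearch.common.MurmurHash;

public class MurmurHashUsageSketch {
    public static void main(String[] args) {
        // Illustrative key; any byte[] region works. Seed 0L matches what the bloom filter uses.
        byte[] key = "user:42".getBytes(StandardCharsets.UTF_8);

        // One pass over the key yields both 64-bit halves of the 128-bit digest.
        long[] hash = MurmurHash.hash3_x64_128(key, 0, key.length, 0L);
        System.out.println("h1 = " + hash[0] + ", h2 = " + hash[1]);
    }
}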
@@ -1,188 +0,0 @@
/*
 * Licensed to Elastic Search and Shay Banon under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Elastic Search licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.common.bloom;

import java.nio.ByteBuffer;

/**
 * This is a very fast, non-cryptographic hash suitable for general hash-based
 * lookup. See http://murmurhash.googlepages.com/ for more details.
 * <p/>
 * <p>
 * The C version of MurmurHash 2.0 found at that site was ported to Java by
 * Andrzej Bialecki (ab at getopt org).
 * </p>
 */
public class MurmurHash {
    public static int hash32(ByteBuffer data, int offset, int length, int seed) {
        int m = 0x5bd1e995;
        int r = 24;

        int h = seed ^ length;

        int len_4 = length >> 2;

        for (int i = 0; i < len_4; i++) {
            int i_4 = i << 2;
            int k = data.get(offset + i_4 + 3);
            k = k << 8;
            k = k | (data.get(offset + i_4 + 2) & 0xff);
            k = k << 8;
            k = k | (data.get(offset + i_4 + 1) & 0xff);
            k = k << 8;
            k = k | (data.get(offset + i_4 + 0) & 0xff);
            k *= m;
            k ^= k >>> r;
            k *= m;
            h *= m;
            h ^= k;
        }

        // avoid calculating modulo
        int len_m = len_4 << 2;
        int left = length - len_m;

        if (left != 0) {
            if (left >= 3) {
                h ^= (int) data.get(offset + length - 3) << 16;
            }
            if (left >= 2) {
                h ^= (int) data.get(offset + length - 2) << 8;
            }
            if (left >= 1) {
                h ^= (int) data.get(offset + length - 1);
            }

            h *= m;
        }

        h ^= h >>> 13;
        h *= m;
        h ^= h >>> 15;

        return h;
    }

    public static long hash64(ByteBuffer key, int offset, int length, long seed) {
        long m64 = 0xc6a4a7935bd1e995L;
        int r64 = 47;

        long h64 = (seed & 0xffffffffL) ^ (m64 * length);

        int lenLongs = length >> 3;

        for (int i = 0; i < lenLongs; ++i) {
            int i_8 = i << 3;

            long k64 = ((long) key.get(offset + i_8 + 0) & 0xff) + (((long) key.get(offset + i_8 + 1) & 0xff) << 8) +
                    (((long) key.get(offset + i_8 + 2) & 0xff) << 16) + (((long) key.get(offset + i_8 + 3) & 0xff) << 24) +
                    (((long) key.get(offset + i_8 + 4) & 0xff) << 32) + (((long) key.get(offset + i_8 + 5) & 0xff) << 40) +
                    (((long) key.get(offset + i_8 + 6) & 0xff) << 48) + (((long) key.get(offset + i_8 + 7) & 0xff) << 56);

            k64 *= m64;
            k64 ^= k64 >>> r64;
            k64 *= m64;

            h64 ^= k64;
            h64 *= m64;
        }

        int rem = length & 0x7;

        switch (rem) {
            case 0:
                break;
            case 7:
                h64 ^= (long) key.get(offset + length - rem + 6) << 48;
            case 6:
                h64 ^= (long) key.get(offset + length - rem + 5) << 40;
            case 5:
                h64 ^= (long) key.get(offset + length - rem + 4) << 32;
            case 4:
                h64 ^= (long) key.get(offset + length - rem + 3) << 24;
            case 3:
                h64 ^= (long) key.get(offset + length - rem + 2) << 16;
            case 2:
                h64 ^= (long) key.get(offset + length - rem + 1) << 8;
            case 1:
                h64 ^= (long) key.get(offset + length - rem);
                h64 *= m64;
        }

        h64 ^= h64 >>> r64;
        h64 *= m64;
        h64 ^= h64 >>> r64;

        return h64;
    }

    public static long hash64(byte[] key, int offset, int length, long seed) {
        long m64 = 0xc6a4a7935bd1e995L;
        int r64 = 47;

        long h64 = (seed & 0xffffffffL) ^ (m64 * length);

        int lenLongs = length >> 3;

        for (int i = 0; i < lenLongs; ++i) {
            int i_8 = i << 3;

            long k64 = ((long) key[offset + i_8 + 0] & 0xff) + (((long) key[offset + i_8 + 1] & 0xff) << 8) +
                    (((long) key[offset + i_8 + 2] & 0xff) << 16) + (((long) key[offset + i_8 + 3] & 0xff) << 24) +
                    (((long) key[offset + i_8 + 4] & 0xff) << 32) + (((long) key[offset + i_8 + 5] & 0xff) << 40) +
                    (((long) key[offset + i_8 + 6] & 0xff) << 48) + (((long) key[offset + i_8 + 7] & 0xff) << 56);

            k64 *= m64;
            k64 ^= k64 >>> r64;
            k64 *= m64;

            h64 ^= k64;
            h64 *= m64;
        }

        int rem = length & 0x7;

        switch (rem) {
            case 0:
                break;
            case 7:
                h64 ^= (long) key[offset + length - rem + 6] << 48;
            case 6:
                h64 ^= (long) key[offset + length - rem + 5] << 40;
            case 5:
                h64 ^= (long) key[offset + length - rem + 4] << 32;
            case 4:
                h64 ^= (long) key[offset + length - rem + 3] << 24;
            case 3:
                h64 ^= (long) key[offset + length - rem + 2] << 16;
            case 2:
                h64 ^= (long) key[offset + length - rem + 1] << 8;
            case 1:
                h64 ^= (long) key[offset + length - rem];
                h64 *= m64;
        }

        h64 ^= h64 >>> r64;
        h64 *= m64;
        h64 ^= h64 >>> r64;

        return h64;
    }
}
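The practical difference between the deleted class above and its replacement: the old MurmurHash 2.0 code had to walk the key twice to obtain the two independent 64-bit values the bloom filter needs (the second hash64() call seeded with the result of the first), whereas a single hash3_x64_128() call now yields both halves in one pass. A sketch of the two schemes, assuming only the new class from this commit (the method names below are illustrative, not part of the commit):

import org.elasticsearch.common.MurmurHash;

class TwoHashValuesSketch {
    // Old scheme (MurmurHash 2.0): two full passes over the key; hash2_64() is the
    // 2.0 variant kept in the new class, equivalent to the deleted hash64().
    static long[] oldScheme(byte[] key) {
        long hash1 = MurmurHash.hash2_64(key, 0, key.length, 0L);
        long hash2 = MurmurHash.hash2_64(key, 0, key.length, hash1);
        return new long[]{hash1, hash2};
    }

    // New scheme (MurmurHash 3): a single pass produces a 128-bit digest, i.e. both values at once.
    static long[] newScheme(byte[] key) {
        return MurmurHash.hash3_x64_128(key, 0, key.length, 0L);
    }
}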
@@ -20,6 +20,7 @@
 package org.elasticsearch.common.bloom;
 
 import org.apache.lucene.util.OpenBitSet;
+import org.elasticsearch.common.MurmurHash;
 import org.elasticsearch.common.RamUsage;
 
 public class ObsBloomFilter implements BloomFilter {
@@ -53,17 +54,11 @@ public class ObsBloomFilter implements BloomFilter {
         return getHashBuckets(key, offset, length, hashCount, buckets());
     }
 
-    // Murmur is faster than an SHA-based approach and provides as-good collision
-    // resistance. The combinatorial generation approach described in
-    // http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/esa06.pdf
-    // does prove to work in actual tests, and is obviously faster
-    // than performing further iterations of murmur.
     static long[] getHashBuckets(byte[] b, int offset, int length, int hashCount, long max) {
         long[] result = new long[hashCount];
-        long hash1 = MurmurHash.hash64(b, offset, length, 0L);
-        long hash2 = MurmurHash.hash64(b, offset, length, hash1);
+        long[] hash = MurmurHash.hash3_x64_128(b, offset, length, 0L);
         for (int i = 0; i < hashCount; ++i) {
-            result[i] = Math.abs((hash1 + (long) i * hash2) % max);
+            result[i] = Math.abs((hash[0] + (long) i * hash[1]) % max);
         }
         return result;
     }
@@ -71,10 +66,9 @@ public class ObsBloomFilter implements BloomFilter {
     @Override
     public void add(byte[] key, int offset, int length) {
         // inline the hash buckets so we don't have to create the int[] each time...
-        long hash1 = MurmurHash.hash64(key, offset, length, 0L);
-        long hash2 = MurmurHash.hash64(key, offset, length, hash1);
+        long[] hash = MurmurHash.hash3_x64_128(key, offset, length, 0L);
         for (int i = 0; i < hashCount; ++i) {
-            long bucketIndex = Math.abs((hash1 + (long) i * hash2) % size);
+            long bucketIndex = Math.abs((hash[0] + (long) i * hash[1]) % size);
             bitset.fastSet(bucketIndex);
         }
     }
@@ -82,10 +76,9 @@ public class ObsBloomFilter implements BloomFilter {
     @Override
     public boolean isPresent(byte[] key, int offset, int length) {
         // inline the hash buckets so we don't have to create the int[] each time...
-        long hash1 = MurmurHash.hash64(key, offset, length, 0L);
-        long hash2 = MurmurHash.hash64(key, offset, length, hash1);
+        long[] hash = MurmurHash.hash3_x64_128(key, offset, length, 0L);
         for (int i = 0; i < hashCount; ++i) {
-            long bucketIndex = Math.abs((hash1 + (long) i * hash2) % size);
+            long bucketIndex = Math.abs((hash[0] + (long) i * hash[1]) % size);
             if (!bitset.fastGet(bucketIndex)) {
                 return false;
             }
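The comment removed above getHashBuckets() pointed at the Kirsch-Mitzenmacher result (http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/esa06.pdf): a bloom filter's k hash functions can be simulated as g_i(x) = h1(x) + i * h2(x) mod m from just two base hash values, so no extra hashing rounds are needed per key. With MurmurHash 3, h1 and h2 are simply the two halves of one 128-bit digest. A standalone sketch of the bucket derivation (class name, key, and filter parameters are illustrative; the expression mirrors the updated getHashBuckets()):

import java.nio.charset.StandardCharsets;

import org.elasticsearch.common.MurmurHash;

public class BloomBucketsSketch {
    public static void main(String[] args) {
        byte[] key = "some-term".getBytes(StandardCharsets.UTF_8); // illustrative key
        int hashCount = 5;   // illustrative k (number of hash functions)
        long max = 1L << 20; // illustrative number of bits in the filter

        long[] hash = MurmurHash.hash3_x64_128(key, 0, key.length, 0L);
        for (int i = 0; i < hashCount; ++i) {
            // g_i(x) = (h1 + i * h2) mod max, folded to a non-negative index as in ObsBloomFilter
            long bucketIndex = Math.abs((hash[0] + (long) i * hash[1]) % max);
            System.out.println("bucket[" + i + "] = " + bucketIndex);
        }
    }
}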