mirror of https://github.com/apache/lucene.git
Try using Murmurhash 3 for bloom filters (#12868)
This commit is contained in:
parent
e94ef1f315
commit
286b59a2db
|
@ -54,8 +54,9 @@ import org.apache.lucene.util.automaton.CompiledAutomaton;
|
|||
*
|
||||
* <p>A choice of {@link BloomFilterFactory} can be passed to tailor Bloom Filter settings on a
|
||||
* per-field basis. The default configuration is {@link DefaultBloomFilterFactory} which allocates a
|
||||
* ~8mb bitset and hashes values using {@link MurmurHash64}. This should be suitable for most
|
||||
* purposes.
|
||||
* ~8mb bitset and hashes values using {@link
|
||||
* org.apache.lucene.util.StringHelper#murmurhash3_x64_128(BytesRef)}. This should be suitable for
|
||||
* most purposes.
|
||||
*
|
||||
* <p>The format of the blm file is as follows:
|
||||
*
|
||||
|
|
|
@ -24,6 +24,7 @@ import org.apache.lucene.util.Accountable;
|
|||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
import org.apache.lucene.util.StringHelper;
|
||||
|
||||
/**
|
||||
* A class used to represent a set of many, potentially large, values (e.g. many long strings such
|
||||
|
@ -53,7 +54,6 @@ public class FuzzySet implements Accountable {
|
|||
NO
|
||||
};
|
||||
|
||||
private HashFunction hashFunction;
|
||||
private FixedBitSet filter;
|
||||
private int bloomSize;
|
||||
private final int hashCount;
|
||||
|
@ -138,7 +138,6 @@ public class FuzzySet implements Accountable {
|
|||
super();
|
||||
this.filter = filter;
|
||||
this.bloomSize = bloomSize;
|
||||
this.hashFunction = MurmurHash64.INSTANCE;
|
||||
this.hashCount = hashCount;
|
||||
}
|
||||
|
||||
|
@ -150,11 +149,12 @@ public class FuzzySet implements Accountable {
|
|||
* @return NO or MAYBE
|
||||
*/
|
||||
public ContainsResult contains(BytesRef value) {
|
||||
long hash = hashFunction.hash(value);
|
||||
int msb = (int) (hash >>> Integer.SIZE);
|
||||
int lsb = (int) hash;
|
||||
long[] hash = StringHelper.murmurhash3_x64_128(value);
|
||||
|
||||
long msb = hash[0];
|
||||
long lsb = hash[1];
|
||||
for (int i = 0; i < hashCount; i++) {
|
||||
int bloomPos = (lsb + i * msb);
|
||||
int bloomPos = ((int) (lsb + i * msb)) & bloomSize;
|
||||
if (!mayContainValue(bloomPos)) {
|
||||
return ContainsResult.NO;
|
||||
}
|
||||
|
@ -216,15 +216,14 @@ public class FuzzySet implements Accountable {
|
|||
* is modulo n'd where n is the chosen size of the internal bitset.
|
||||
*
|
||||
* @param value the key value to be hashed
|
||||
* @throws IOException If there is a low-level I/O error
|
||||
*/
|
||||
public void addValue(BytesRef value) throws IOException {
|
||||
long hash = hashFunction.hash(value);
|
||||
int msb = (int) (hash >>> Integer.SIZE);
|
||||
int lsb = (int) hash;
|
||||
public void addValue(BytesRef value) {
|
||||
long[] hash = StringHelper.murmurhash3_x64_128(value);
|
||||
long msb = hash[0];
|
||||
long lsb = hash[1];
|
||||
for (int i = 0; i < hashCount; i++) {
|
||||
// Bitmasking using bloomSize is effectively a modulo operation.
|
||||
int bloomPos = (lsb + i * msb) & bloomSize;
|
||||
int bloomPos = ((int) (lsb + i * msb)) & bloomSize;
|
||||
filter.set(bloomPos);
|
||||
}
|
||||
}
|
||||
|
@ -302,9 +301,7 @@ public class FuzzySet implements Accountable {
|
|||
@Override
|
||||
public String toString() {
|
||||
return getClass().getSimpleName()
|
||||
+ "(hash="
|
||||
+ hashFunction
|
||||
+ ", k="
|
||||
+ "(k="
|
||||
+ hashCount
|
||||
+ ", bits="
|
||||
+ filter.cardinality()
|
||||
|
|
|
@ -1,37 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.bloom;
|
||||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/**
|
||||
* Base class for hashing functions that can be referred to by name. Subclasses are expected to
|
||||
* provide threadsafe implementations of the hash function on the range of bytes referenced in the
|
||||
* provided {@link BytesRef}
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public abstract class HashFunction {
|
||||
|
||||
/**
|
||||
* Hashes the contents of the referenced bytes
|
||||
*
|
||||
* @param bytes the data to be hashed
|
||||
* @return the hash of the bytes referenced by bytes.offset and length bytes.length
|
||||
*/
|
||||
public abstract long hash(BytesRef bytes);
|
||||
}
|
|
@ -1,85 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.codecs.bloom;
|
||||
|
||||
import org.apache.lucene.util.BitUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/**
|
||||
* This is a very fast, non-cryptographic hash suitable for general hash-based lookup. See
|
||||
* http://murmurhash.googlepages.com/ for more details.
|
||||
*
|
||||
* <p>The code from Apache Commons was adapted in the form here to work with BytesRefs with offsets
|
||||
* and lengths rather than raw byte arrays.
|
||||
*/
|
||||
public class MurmurHash64 extends HashFunction {
|
||||
private static final long M64 = 0xc6a4a7935bd1e995L;
|
||||
private static final int R64 = 47;
|
||||
public static final HashFunction INSTANCE = new MurmurHash64();
|
||||
|
||||
/**
|
||||
* Generates a 64-bit hash from byte array of the given length and seed.
|
||||
*
|
||||
* @param data The input byte array
|
||||
* @param seed The initial seed value
|
||||
* @param length The length of the array
|
||||
* @return The 64-bit hash of the given array
|
||||
*/
|
||||
public static long hash64(byte[] data, int seed, int offset, int length) {
|
||||
long h = (seed & 0xffffffffL) ^ (length * M64);
|
||||
|
||||
final int nblocks = length >> 3;
|
||||
|
||||
// body
|
||||
for (int i = 0; i < nblocks; i++) {
|
||||
|
||||
long k = (long) BitUtil.VH_LE_LONG.get(data, offset);
|
||||
k *= M64;
|
||||
k ^= k >>> R64;
|
||||
k *= M64;
|
||||
|
||||
h ^= k;
|
||||
h *= M64;
|
||||
|
||||
offset += Long.BYTES;
|
||||
}
|
||||
|
||||
int remaining = length & 0x07;
|
||||
if (0 < remaining) {
|
||||
for (int i = 0; i < remaining; i++) {
|
||||
h ^= ((long) data[offset + i] & 0xff) << (Byte.SIZE * i);
|
||||
}
|
||||
h *= M64;
|
||||
}
|
||||
|
||||
h ^= h >>> R64;
|
||||
h *= M64;
|
||||
h ^= h >>> R64;
|
||||
|
||||
return h;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final long hash(BytesRef br) {
|
||||
return hash64(br.bytes, 0xe17a1465, br.offset, br.length);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return getClass().getSimpleName();
|
||||
}
|
||||
}
|
|
@ -209,6 +209,156 @@ public abstract class StringHelper {
|
|||
return murmurhash3_x86_32(bytes.bytes, bytes.offset, bytes.length, seed);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates 128-bit hash from the byte array with the given offset, length and seed.
|
||||
*
|
||||
* <p>The code is adopted from Apache Commons (<a
|
||||
* href="https://commons.apache.org/proper/commons-codec/jacoco/org.apache.commons.codec.digest/MurmurHash3.java.html">link</a>)
|
||||
*
|
||||
* @param data The input byte array
|
||||
* @param offset The first element of array
|
||||
* @param length The length of array
|
||||
* @param seed The initial seed value
|
||||
* @return The 128-bit hash (2 longs)
|
||||
*/
|
||||
public static long[] murmurhash3_x64_128(
|
||||
final byte[] data, final int offset, final int length, final int seed) {
|
||||
// Use an unsigned 32-bit integer as the seed
|
||||
return murmurhash3_x64_128(data, offset, length, seed & 0xFFFFFFFFL);
|
||||
}
|
||||
|
||||
@SuppressWarnings("fallthrough")
|
||||
private static long[] murmurhash3_x64_128(
|
||||
final byte[] data, final int offset, final int length, final long seed) {
|
||||
long h1 = seed;
|
||||
long h2 = seed;
|
||||
final int nblocks = length >> 4;
|
||||
|
||||
// Constants for 128-bit variant
|
||||
final long C1 = 0x87c37b91114253d5L;
|
||||
final long C2 = 0x4cf5ad432745937fL;
|
||||
final int R1 = 31;
|
||||
final int R2 = 27;
|
||||
final int R3 = 33;
|
||||
final int M = 5;
|
||||
final int N1 = 0x52dce729;
|
||||
final int N2 = 0x38495ab5;
|
||||
|
||||
// body
|
||||
for (int i = 0; i < nblocks; i++) {
|
||||
final int index = offset + (i << 4);
|
||||
long k1 = (long) BitUtil.VH_LE_LONG.get(data, index);
|
||||
long k2 = (long) BitUtil.VH_LE_LONG.get(data, index + 8);
|
||||
|
||||
// mix functions for k1
|
||||
k1 *= C1;
|
||||
k1 = Long.rotateLeft(k1, R1);
|
||||
k1 *= C2;
|
||||
h1 ^= k1;
|
||||
h1 = Long.rotateLeft(h1, R2);
|
||||
h1 += h2;
|
||||
h1 = h1 * M + N1;
|
||||
|
||||
// mix functions for k2
|
||||
k2 *= C2;
|
||||
k2 = Long.rotateLeft(k2, R3);
|
||||
k2 *= C1;
|
||||
h2 ^= k2;
|
||||
h2 = Long.rotateLeft(h2, R1);
|
||||
h2 += h1;
|
||||
h2 = h2 * M + N2;
|
||||
}
|
||||
|
||||
// tail
|
||||
long k1 = 0;
|
||||
long k2 = 0;
|
||||
final int index = offset + (nblocks << 4);
|
||||
switch (length & 0x0F) {
|
||||
case 15:
|
||||
k2 ^= ((long) data[index + 14] & 0xff) << 48;
|
||||
case 14:
|
||||
k2 ^= ((long) data[index + 13] & 0xff) << 40;
|
||||
case 13:
|
||||
k2 ^= ((long) data[index + 12] & 0xff) << 32;
|
||||
case 12:
|
||||
k2 ^= ((long) data[index + 11] & 0xff) << 24;
|
||||
case 11:
|
||||
k2 ^= ((long) data[index + 10] & 0xff) << 16;
|
||||
case 10:
|
||||
k2 ^= ((long) data[index + 9] & 0xff) << 8;
|
||||
case 9:
|
||||
k2 ^= data[index + 8] & 0xff;
|
||||
k2 *= C2;
|
||||
k2 = Long.rotateLeft(k2, R3);
|
||||
k2 *= C1;
|
||||
h2 ^= k2;
|
||||
|
||||
case 8:
|
||||
k1 ^= ((long) data[index + 7] & 0xff) << 56;
|
||||
case 7:
|
||||
k1 ^= ((long) data[index + 6] & 0xff) << 48;
|
||||
case 6:
|
||||
k1 ^= ((long) data[index + 5] & 0xff) << 40;
|
||||
case 5:
|
||||
k1 ^= ((long) data[index + 4] & 0xff) << 32;
|
||||
case 4:
|
||||
k1 ^= ((long) data[index + 3] & 0xff) << 24;
|
||||
case 3:
|
||||
k1 ^= ((long) data[index + 2] & 0xff) << 16;
|
||||
case 2:
|
||||
k1 ^= ((long) data[index + 1] & 0xff) << 8;
|
||||
case 1:
|
||||
k1 ^= data[index] & 0xff;
|
||||
k1 *= C1;
|
||||
k1 = Long.rotateLeft(k1, R1);
|
||||
k1 *= C2;
|
||||
h1 ^= k1;
|
||||
}
|
||||
|
||||
// finalization
|
||||
h1 ^= length;
|
||||
h2 ^= length;
|
||||
|
||||
h1 += h2;
|
||||
h2 += h1;
|
||||
|
||||
h1 = fmix64(h1);
|
||||
h2 = fmix64(h2);
|
||||
|
||||
h1 += h2;
|
||||
h2 += h1;
|
||||
|
||||
return new long[] {h1, h2};
|
||||
}
|
||||
|
||||
/**
|
||||
* Performs the final avalanche mix step of the 64-bit hash function.
|
||||
*
|
||||
* @param hash The current hash
|
||||
* @return The final hash
|
||||
*/
|
||||
private static long fmix64(long hash) {
|
||||
hash ^= (hash >>> 33);
|
||||
hash *= 0xff51afd7ed558ccdL;
|
||||
hash ^= (hash >>> 33);
|
||||
hash *= 0xc4ceb9fe1a85ec53L;
|
||||
hash ^= (hash >>> 33);
|
||||
return hash;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates 128-bit hash from the byte array with the given offset, length and seed.
|
||||
*
|
||||
* <p>The code is adopted from Apache Commons (<a
|
||||
* href="https://commons.apache.org/proper/commons-codec/jacoco/org.apache.commons.codec.digest/MurmurHash3.java.html">link</a>)
|
||||
*
|
||||
* @param data The input data
|
||||
* @return The 128-bit hash (2 longs)
|
||||
*/
|
||||
public static long[] murmurhash3_x64_128(BytesRef data) {
|
||||
return murmurhash3_x64_128(data.bytes, data.offset, data.length, 104729);
|
||||
}
|
||||
|
||||
// Holds 128 bit unsigned value:
|
||||
private static BigInteger nextId;
|
||||
private static final BigInteger mask128;
|
||||
|
|
Loading…
Reference in New Issue