Try using Murmurhash 3 for bloom filters (#12868)

This commit is contained in:
Shubham Chaudhary 2024-10-14 18:38:18 +05:30 committed by GitHub
parent e94ef1f315
commit 286b59a2db
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 165 additions and 139 deletions

View File

@ -54,8 +54,9 @@ import org.apache.lucene.util.automaton.CompiledAutomaton;
*
* <p>A choice of {@link BloomFilterFactory} can be passed to tailor Bloom Filter settings on a
* per-field basis. The default configuration is {@link DefaultBloomFilterFactory} which allocates a
* ~8mb bitset and hashes values using {@link MurmurHash64}. This should be suitable for most
* purposes.
* ~8mb bitset and hashes values using {@link
* org.apache.lucene.util.StringHelper#murmurhash3_x64_128(BytesRef)}. This should be suitable for
* most purposes.
*
* <p>The format of the blm file is as follows:
*

View File

@ -24,6 +24,7 @@ import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
/**
* A class used to represent a set of many, potentially large, values (e.g. many long strings such
@ -53,7 +54,6 @@ public class FuzzySet implements Accountable {
NO
};
private HashFunction hashFunction;
private FixedBitSet filter;
private int bloomSize;
private final int hashCount;
@ -138,7 +138,6 @@ public class FuzzySet implements Accountable {
super();
this.filter = filter;
this.bloomSize = bloomSize;
this.hashFunction = MurmurHash64.INSTANCE;
this.hashCount = hashCount;
}
@ -150,11 +149,12 @@ public class FuzzySet implements Accountable {
* @return NO or MAYBE
*/
public ContainsResult contains(BytesRef value) {
long hash = hashFunction.hash(value);
int msb = (int) (hash >>> Integer.SIZE);
int lsb = (int) hash;
long[] hash = StringHelper.murmurhash3_x64_128(value);
long msb = hash[0];
long lsb = hash[1];
for (int i = 0; i < hashCount; i++) {
int bloomPos = (lsb + i * msb);
int bloomPos = ((int) (lsb + i * msb)) & bloomSize;
if (!mayContainValue(bloomPos)) {
return ContainsResult.NO;
}
@ -216,15 +216,14 @@ public class FuzzySet implements Accountable {
* is modulo n'd where n is the chosen size of the internal bitset.
*
* @param value the key value to be hashed
* @throws IOException If there is a low-level I/O error
*/
public void addValue(BytesRef value) throws IOException {
long hash = hashFunction.hash(value);
int msb = (int) (hash >>> Integer.SIZE);
int lsb = (int) hash;
public void addValue(BytesRef value) {
long[] hash = StringHelper.murmurhash3_x64_128(value);
long msb = hash[0];
long lsb = hash[1];
for (int i = 0; i < hashCount; i++) {
// Bitmasking using bloomSize is effectively a modulo operation.
int bloomPos = (lsb + i * msb) & bloomSize;
int bloomPos = ((int) (lsb + i * msb)) & bloomSize;
filter.set(bloomPos);
}
}
@ -302,9 +301,7 @@ public class FuzzySet implements Accountable {
@Override
public String toString() {
return getClass().getSimpleName()
+ "(hash="
+ hashFunction
+ ", k="
+ "(k="
+ hashCount
+ ", bits="
+ filter.cardinality()

View File

@ -1,37 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.bloom;
import org.apache.lucene.util.BytesRef;
/**
* Base class for hashing functions that reduce the bytes referenced by a {@link BytesRef} to a
* single 64-bit value. Subclasses are expected to provide threadsafe implementations of the hash
* function on the range of bytes referenced in the provided {@link BytesRef}.
*
* @lucene.experimental
*/
public abstract class HashFunction {
/**
* Hashes the contents of the referenced bytes.
*
* @param bytes the data to be hashed; only the range starting at {@code bytes.offset} and
*     spanning {@code bytes.length} bytes is expected to contribute to the result
* @return the hash of the bytes referenced by bytes.offset and length bytes.length
*/
public abstract long hash(BytesRef bytes);
}

View File

@ -1,85 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.bloom;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.BytesRef;
/**
 * A fast, non-cryptographic 64-bit Murmur hash intended for general hash-based lookup. See
 * http://murmurhash.googlepages.com/ for background on the algorithm.
 *
 * <p>The implementation was adapted from Apache Commons so that it operates on a slice of a byte
 * array (as described by a {@link BytesRef} offset and length) instead of a whole array.
 */
public class MurmurHash64 extends HashFunction {
  private static final long M64 = 0xc6a4a7935bd1e995L;
  private static final int R64 = 47;

  /** Seed used by {@link #hash(BytesRef)}. */
  private static final int DEFAULT_SEED = 0xe17a1465;

  public static final HashFunction INSTANCE = new MurmurHash64();

  /**
   * Generates a 64-bit hash from a slice of the given byte array.
   *
   * @param data The input byte array
   * @param seed The initial seed value; only its low 32 bits are used, treated as unsigned
   * @param offset The index of the first byte to hash
   * @param length The number of bytes to hash
   * @return The 64-bit hash of the selected bytes
   */
  public static long hash64(byte[] data, int seed, int offset, int length) {
    long hash = (seed & 0xffffffffL) ^ (length * M64);

    // Body: consume the input eight bytes at a time, advancing a cursor over the slice.
    int cursor = offset;
    for (int blocksLeft = length >> 3; blocksLeft > 0; blocksLeft--) {
      long block = (long) BitUtil.VH_LE_LONG.get(data, cursor);
      block *= M64;
      block ^= block >>> R64;
      hash = (hash ^ (block * M64)) * M64;
      cursor += Long.BYTES;
    }

    // Tail: fold in the trailing 1..7 bytes, lowest-addressed byte in the lowest bits.
    final int tailLength = length & 0x07;
    if (tailLength > 0) {
      for (int i = 0; i < tailLength; i++) {
        hash ^= (data[cursor + i] & 0xffL) << (i << 3);
      }
      hash *= M64;
    }

    // Final avalanche.
    hash ^= hash >>> R64;
    hash *= M64;
    return hash ^ (hash >>> R64);
  }

  /**
   * Hashes the bytes referenced by {@code br} using a fixed seed.
   *
   * @param br the slice of bytes to hash
   * @return the 64-bit hash of the referenced bytes
   */
  @Override
  public final long hash(BytesRef br) {
    return hash64(br.bytes, DEFAULT_SEED, br.offset, br.length);
  }

  /** Returns the simple class name of this hash function. */
  @Override
  public String toString() {
    return getClass().getSimpleName();
  }
}

View File

@ -209,6 +209,156 @@ public abstract class StringHelper {
return murmurhash3_x86_32(bytes.bytes, bytes.offset, bytes.length, seed);
}
/**
 * Computes the 128-bit MurmurHash3 (x64 variant) of a slice of a byte array.
 *
 * <p>The code is adapted from Apache Commons (<a
 * href="https://commons.apache.org/proper/commons-codec/jacoco/org.apache.commons.codec.digest/MurmurHash3.java.html">link</a>)
 *
 * @param data The input byte array
 * @param offset The index of the first byte to hash
 * @param length The number of bytes to hash
 * @param seed The initial seed value, interpreted as an unsigned 32-bit integer
 * @return The 128-bit hash (2 longs)
 */
public static long[] murmurhash3_x64_128(
    final byte[] data, final int offset, final int length, final int seed) {
  // Widen without sign extension so a negative seed maps to its unsigned 32-bit value.
  final long unsignedSeed = Integer.toUnsignedLong(seed);
  return murmurhash3_x64_128(data, offset, length, unsignedSeed);
}
/**
 * Core of the 128-bit MurmurHash3 (x64 variant): processes full 16-byte blocks, then the
 * trailing 0..15 bytes, then applies the finalization mix.
 *
 * @param data The input byte array
 * @param offset The index of the first byte to hash
 * @param length The number of bytes to hash
 * @param seed The initial value of both 64-bit state words
 * @return A two-element array holding the first and second 64-bit halves of the hash
 */
private static long[] murmurhash3_x64_128(
    final byte[] data, final int offset, final int length, final long seed) {
  // Multiplicative constants of the 128-bit variant.
  final long c1 = 0x87c37b91114253d5L;
  final long c2 = 0x4cf5ad432745937fL;

  long h1 = seed;
  long h2 = seed;

  // Body: each 16-byte block is read as two longs and mixed into h1 and h2 respectively.
  final int blockCount = length >> 4;
  for (int block = 0; block < blockCount; block++) {
    final int pos = offset + (block << 4);
    final long first = (long) BitUtil.VH_LE_LONG.get(data, pos);
    final long second = (long) BitUtil.VH_LE_LONG.get(data, pos + 8);

    h1 ^= Long.rotateLeft(first * c1, 31) * c2;
    h1 = Long.rotateLeft(h1, 27) + h2;
    h1 = h1 * 5 + 0x52dce729;

    h2 ^= Long.rotateLeft(second * c2, 33) * c1;
    h2 = Long.rotateLeft(h2, 31) + h1;
    h2 = h2 * 5 + 0x38495ab5;
  }

  // Tail: the remaining bytes are assembled little-endian, highest index first. Bytes 8..14 of
  // the tail feed h2 and bytes 0..7 feed h1; each word is mixed only if it received any byte.
  final int tailStart = offset + (blockCount << 4);
  final int tailLength = length & 0x0F;
  if (tailLength > 8) {
    long k2 = 0;
    for (int i = tailLength - 1; i >= 8; i--) {
      k2 ^= ((long) data[tailStart + i] & 0xff) << ((i - 8) << 3);
    }
    h2 ^= Long.rotateLeft(k2 * c2, 33) * c1;
  }
  if (tailLength > 0) {
    long k1 = 0;
    for (int i = Math.min(tailLength, 8) - 1; i >= 0; i--) {
      k1 ^= ((long) data[tailStart + i] & 0xff) << (i << 3);
    }
    h1 ^= Long.rotateLeft(k1 * c1, 31) * c2;
  }

  // Finalization: fold in the length, cross-add the halves, avalanche, and cross-add again.
  h1 ^= length;
  h2 ^= length;

  h1 += h2;
  h2 += h1;

  h1 = fmix64(h1);
  h2 = fmix64(h2);

  h1 += h2;
  h2 += h1;

  return new long[] {h1, h2};
}
/**
 * Applies the final avalanche mix of the 64-bit hash function: three xor-shift steps by 33 bits
 * separated by two multiplications.
 *
 * @param hash The current hash
 * @return The final hash
 */
private static long fmix64(long hash) {
  long mixed = hash;
  mixed = (mixed ^ (mixed >>> 33)) * 0xff51afd7ed558ccdL;
  mixed = (mixed ^ (mixed >>> 33)) * 0xc4ceb9fe1a85ec53L;
  return mixed ^ (mixed >>> 33);
}
/**
 * Computes the 128-bit MurmurHash3 (x64 variant) of the bytes referenced by the given {@link
 * BytesRef}, using a fixed seed.
 *
 * <p>The code is adapted from Apache Commons (<a
 * href="https://commons.apache.org/proper/commons-codec/jacoco/org.apache.commons.codec.digest/MurmurHash3.java.html">link</a>)
 *
 * @param data The input data; the range starting at {@code data.offset} and spanning {@code
 *     data.length} bytes of {@code data.bytes} is hashed
 * @return The 128-bit hash (2 longs)
 */
public static long[] murmurhash3_x64_128(BytesRef data) {
  // Fixed seed shared by every caller of this overload.
  final int fixedSeed = 104729;
  return murmurhash3_x64_128(data.bytes, data.offset, data.length, fixedSeed);
}
// Holds 128 bit unsigned value:
private static BigInteger nextId;
private static final BigInteger mask128;