diff --git a/src/main/java/org/elasticsearch/common/lucene/uid/UidField.java b/src/main/java/org/elasticsearch/common/lucene/uid/UidField.java
index 40ec2107ab6..6c29f160597 100644
--- a/src/main/java/org/elasticsearch/common/lucene/uid/UidField.java
+++ b/src/main/java/org/elasticsearch/common/lucene/uid/UidField.java
@@ -24,13 +24,12 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.document.Field;
-import org.apache.lucene.index.AtomicReaderContext;
-import org.apache.lucene.index.DocsAndPositionsEnum;
-import org.apache.lucene.index.Term;
+import org.apache.lucene.index.*;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.Numbers;
import org.elasticsearch.common.lucene.Lucene;
+import org.elasticsearch.index.codec.postingsformat.BloomFilterPostingsFormat;
import org.elasticsearch.index.mapper.internal.UidFieldMapper;
import java.io.IOException;
@@ -56,11 +55,27 @@ public class UidField extends Field {
// this works fine for nested docs since they don't have the payload which has the version
// so we iterate till we find the one with the payload
- // LUCENE 4 UPGRADE: We can get rid of the do while loop, since there is only one _uid value (live docs are taken into account)
public static DocIdAndVersion loadDocIdAndVersion(AtomicReaderContext context, Term term) {
int docId = Lucene.NO_DOC;
try {
- DocsAndPositionsEnum uid = context.reader().termPositionsEnum(term);
+ Terms terms = context.reader().terms(term.field());
+ if (terms == null) {
+ return null;
+ }
+ // hack to break early if we have a bloom filter...
+ if (terms instanceof BloomFilterPostingsFormat.BloomFilteredFieldsProducer.BloomFilteredTerms) {
+ if (!((BloomFilterPostingsFormat.BloomFilteredFieldsProducer.BloomFilteredTerms) terms).getFilter().mightContain(term.bytes())) {
+ return null;
+ }
+ }
+ TermsEnum termsEnum = terms.iterator(null);
+ if (termsEnum == null) {
+ return null;
+ }
+ if (!termsEnum.seekExact(term.bytes(), true)) {
+ return null;
+ }
+ DocsAndPositionsEnum uid = termsEnum.docsAndPositions(context.reader().getLiveDocs(), null, DocsAndPositionsEnum.FLAG_PAYLOADS);
if (uid == null || uid.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
return null; // no doc
}
@@ -89,10 +104,26 @@ public class UidField extends Field {
* Load the version for the uid from the reader, returning -1 if no doc exists, or -2 if
* no version is available (for backward comp.)
*/
- // LUCENE 4 UPGRADE: We can get rid of the do while loop, since there is only one _uid value (live docs are taken into account)
public static long loadVersion(AtomicReaderContext context, Term term) {
try {
- DocsAndPositionsEnum uid = context.reader().termPositionsEnum(term);
+ Terms terms = context.reader().terms(term.field());
+ if (terms == null) {
+ return -1;
+ }
+ // hack to break early if we have a bloom filter...
+ if (terms instanceof BloomFilterPostingsFormat.BloomFilteredFieldsProducer.BloomFilteredTerms) {
+ if (!((BloomFilterPostingsFormat.BloomFilteredFieldsProducer.BloomFilteredTerms) terms).getFilter().mightContain(term.bytes())) {
+ return -1;
+ }
+ }
+ TermsEnum termsEnum = terms.iterator(null);
+ if (termsEnum == null) {
+ return -1;
+ }
+ if (!termsEnum.seekExact(term.bytes(), true)) {
+ return -1;
+ }
+ DocsAndPositionsEnum uid = termsEnum.docsAndPositions(context.reader().getLiveDocs(), null, DocsAndPositionsEnum.FLAG_PAYLOADS);
if (uid == null || uid.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
return -1;
}
diff --git a/src/main/java/org/elasticsearch/index/codec/postingsformat/BloomFilter.java b/src/main/java/org/elasticsearch/index/codec/postingsformat/BloomFilter.java
new file mode 100644
index 00000000000..b471b97e1a9
--- /dev/null
+++ b/src/main/java/org/elasticsearch/index/codec/postingsformat/BloomFilter.java
@@ -0,0 +1,503 @@
+/*
+ * Licensed to Elastic Search and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. Elastic Search licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.codec.postingsformat;
+
+import com.google.common.math.LongMath;
+import com.google.common.primitives.Ints;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.common.Nullable;
+import org.elasticsearch.common.Strings;
+import org.elasticsearch.common.unit.SizeValue;
+
+import java.io.IOException;
+import java.math.RoundingMode;
+import java.util.Arrays;
+import java.util.Comparator;
+
+/**
+ * A bloom filter. Inspired by Guava bloom filter implementation though with some optimizations.
+ */
+public class BloomFilter {
+
+ /**
+ * A factory that can use different fpp based on size.
+ */
+ public static class Factory {
+
+ public static final Factory DEFAULT = buildDefault();
+
+ private static Factory buildDefault() {
+ // Some numbers:
+ // 10k =0.001: 140.4kb , 10 Hashes
+ // 10k =0.01 : 93.6kb , 6 Hashes
+ // 100k=0.01 : 936.0kb , 6 Hashes
+ // 100k=0.03 : 712.7kb , 5 Hashes
+ // 500k=0.01 : 4.5mb , 6 Hashes
+ // 500k=0.03 : 3.4mb , 5 Hashes
+ // 500k=0.05 : 2.9mb , 4 Hashes
+ // 1m=0.01 : 9.1mb , 6 Hashes
+ // 1m=0.03 : 6.9mb , 5 Hashes
+ // 1m=0.05 : 5.9mb , 4 Hashes
+ // 5m=0.01 : 45.7mb , 6 Hashes
+ // 5m=0.03 : 34.8mb , 5 Hashes
+ // 5m=0.05 : 29.7mb , 4 Hashes
+ // 50m=0.01 : 457.0mb , 6 Hashes
+ // 50m=0.03 : 297.3mb , 4 Hashes
+ // 50m=0.10 : 228.5mb , 3 Hashes
+ return buildFromString("10k=0.01,1m=0.03");
+ }
+
+ /**
+ * Supports just passing fpp, as in "0.01", and also ranges, like "50k=0.01,1m=0.05". If
+ * its null, returns {@link #buildDefault()}.
+ */
+ public static Factory buildFromString(@Nullable String config) {
+ if (config == null) {
+ return buildDefault();
+ }
+ String[] sEntries = Strings.splitStringToArray(config, ',');
+ if (sEntries.length == 0) {
+ if (config.length() > 0) {
+ return new Factory(new Entry[]{new Entry(0, Double.parseDouble(config))});
+ }
+ return buildDefault();
+ }
+ Entry[] entries = new Entry[sEntries.length];
+ for (int i = 0; i < sEntries.length; i++) {
+ int index = sEntries[i].indexOf('=');
+ entries[i] = new Entry(
+ (int) SizeValue.parseSizeValue(sEntries[i].substring(0, index).trim()).singles(),
+ Double.parseDouble(sEntries[i].substring(index + 1).trim())
+ );
+ }
+ return new Factory(entries);
+ }
+
+ private final Entry[] entries;
+
+ public Factory(Entry[] entries) {
+ this.entries = entries;
+ // the order is from the upper most expected insertions to the lowest
+ Arrays.sort(this.entries, new Comparator() {
+ @Override
+ public int compare(Entry o1, Entry o2) {
+ return o2.expectedInsertions - o1.expectedInsertions;
+ }
+ });
+ }
+
+ public BloomFilter createFilter(int expectedInsertions) {
+ for (Entry entry : entries) {
+ if (expectedInsertions > entry.expectedInsertions) {
+ return BloomFilter.create(expectedInsertions, entry.fpp);
+ }
+ }
+ return BloomFilter.create(expectedInsertions, 0.03);
+ }
+
+ public static class Entry {
+ public final int expectedInsertions;
+ public final double fpp;
+
+ Entry(int expectedInsertions, double fpp) {
+ this.expectedInsertions = expectedInsertions;
+ this.fpp = fpp;
+ }
+ }
+ }
+
+ /**
+ * Creates a bloom filter based on the with the expected number
+ * of insertions and expected false positive probability.
+ *
+ * @param expectedInsertions the number of expected insertions to the constructed
+ * @param fpp the desired false positive probability (must be positive and less than 1.0)
+ */
+ public static BloomFilter create(int expectedInsertions, double fpp) {
+ if (expectedInsertions == 0) {
+ expectedInsertions = 1;
+ }
+ /*
+ * TODO(user): Put a warning in the javadoc about tiny fpp values,
+ * since the resulting size is proportional to -log(p), but there is not
+ * much of a point after all, e.g. optimalM(1000, 0.0000000000000001) = 76680
+ * which is less that 10kb. Who cares!
+ */
+ long numBits = optimalNumOfBits(expectedInsertions, fpp);
+ int numHashFunctions = optimalNumOfHashFunctions(expectedInsertions, numBits);
+ try {
+ return new BloomFilter(new BitArray(numBits), numHashFunctions);
+ } catch (IllegalArgumentException e) {
+ throw new IllegalArgumentException("Could not create BloomFilter of " + numBits + " bits", e);
+ }
+ }
+
+ public static BloomFilter deserialize(DataInput in) throws IOException {
+ int version = in.readInt(); // we do nothing with this now..., defaults to 0
+
+ int numLongs = in.readInt();
+ long[] data = new long[numLongs];
+ for (int i = 0; i < numLongs; i++) {
+ data[i] = in.readLong();
+ }
+
+ int numberOfHashFunctions = in.readInt();
+
+ int hashType = in.readInt(); // again, nothing to do now...
+
+ return new BloomFilter(new BitArray(data), numberOfHashFunctions);
+ }
+
+ public static void serilaize(BloomFilter filter, DataOutput out) throws IOException {
+ out.writeInt(0); // version
+
+ BitArray bits = filter.bits;
+ out.writeInt(bits.data.length);
+ for (long l : bits.data) {
+ out.writeLong(l);
+ }
+
+ out.writeInt(filter.numHashFunctions);
+
+ out.writeInt(0); // hashType
+ }
+
+ /**
+ * The bit set of the BloomFilter (not necessarily power of 2!)
+ */
+ final BitArray bits;
+ /**
+ * Number of hashes per element
+ */
+ final int numHashFunctions;
+
+ BloomFilter(BitArray bits, int numHashFunctions) {
+ this.bits = bits;
+ this.numHashFunctions = numHashFunctions;
+ /*
+ * This only exists to forbid BFs that cannot use the compact persistent representation.
+ * If it ever throws, at a user who was not intending to use that representation, we should
+ * reconsider
+ */
+ if (numHashFunctions > 255) {
+ throw new IllegalArgumentException("Currently we don't allow BloomFilters that would use more than 255 hash functions");
+ }
+ }
+
+ public boolean put(BytesRef value) {
+ long hash64 = hash3_x64_128(value.bytes, value.offset, value.length, 0);
+ int hash1 = (int) hash64;
+ int hash2 = (int) (hash64 >>> 32);
+ boolean bitsChanged = false;
+ for (int i = 1; i <= numHashFunctions; i++) {
+ int nextHash = hash1 + i * hash2;
+ if (nextHash < 0) {
+ nextHash = ~nextHash;
+ }
+ bitsChanged |= bits.set(nextHash % bits.size());
+ }
+ return bitsChanged;
+ }
+
+ public boolean mightContain(BytesRef value) {
+ long hash64 = hash3_x64_128(value.bytes, value.offset, value.length, 0);
+ int hash1 = (int) hash64;
+ int hash2 = (int) (hash64 >>> 32);
+ for (int i = 1; i <= numHashFunctions; i++) {
+ int nextHash = hash1 + i * hash2;
+ if (nextHash < 0) {
+ nextHash = ~nextHash;
+ }
+ if (!bits.get(nextHash % bits.size())) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ public int getNumHashFunctions() {
+ return this.numHashFunctions;
+ }
+
+ public long getSizeInBytes() {
+ return bits.size() + 8;
+ }
+
+ /**
+ * Returns the probability that {@linkplain #mightContain(BytesRef)} will erroneously return
+ * {@code true} for an object that has not actually been put in the {@code BloomFilter}.
+ *
+ * Ideally, this number should be close to the {@code fpp} parameter
+ * passed in create, or smaller. If it is
+ * significantly higher, it is usually the case that too many elements (more than
+ * expected) have been put in the {@code BloomFilter}, degenerating it.
+ */
+ public double getExpectedFpp() {
+ // You down with FPP? (Yeah you know me!) Who's down with FPP? (Every last homie!)
+ return Math.pow((double) bits.bitCount() / bits.size(), numHashFunctions);
+ }
+
+ /*
+ * Cheat sheet:
+ *
+ * m: total bits
+ * n: expected insertions
+ * b: m/n, bits per insertion
+
+ * p: expected false positive probability
+ *
+ * 1) Optimal k = b * ln2
+ * 2) p = (1 - e ^ (-kn/m))^k
+ * 3) For optimal k: p = 2 ^ (-k) ~= 0.6185^b
+ * 4) For optimal k: m = -nlnp / ((ln2) ^ 2)
+ */
+
+ /**
+ * Computes the optimal k (number of hashes per element inserted in Bloom filter), given the
+ * expected insertions and total number of bits in the Bloom filter.
+ *
+ * See http://en.wikipedia.org/wiki/File:Bloom_filter_fp_probability.svg for the formula.
+ *
+ * @param n expected insertions (must be positive)
+ * @param m total number of bits in Bloom filter (must be positive)
+ */
+ static int optimalNumOfHashFunctions(long n, long m) {
+ return Math.max(1, (int) Math.round(m / n * Math.log(2)));
+ }
+
+ /**
+ * Computes m (total bits of Bloom filter) which is expected to achieve, for the specified
+ * expected insertions, the required false positive probability.
+ *
+ * See http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives for the formula.
+ *
+ * @param n expected insertions (must be positive)
+ * @param p false positive rate (must be 0 < p < 1)
+ */
+ static long optimalNumOfBits(long n, double p) {
+ if (p == 0) {
+ p = Double.MIN_VALUE;
+ }
+ return (long) (-n * Math.log(p) / (Math.log(2) * Math.log(2)));
+ }
+
+ // START : MURMUR 3_128
+
+ protected static long getblock(byte[] key, int offset, int index) {
+ int i_8 = index << 3;
+ int blockOffset = offset + i_8;
+ return ((long) key[blockOffset + 0] & 0xff) + (((long) key[blockOffset + 1] & 0xff) << 8) +
+ (((long) key[blockOffset + 2] & 0xff) << 16) + (((long) key[blockOffset + 3] & 0xff) << 24) +
+ (((long) key[blockOffset + 4] & 0xff) << 32) + (((long) key[blockOffset + 5] & 0xff) << 40) +
+ (((long) key[blockOffset + 6] & 0xff) << 48) + (((long) key[blockOffset + 7] & 0xff) << 56);
+ }
+
+ protected static long rotl64(long v, int n) {
+ return ((v << n) | (v >>> (64 - n)));
+ }
+
+ protected static long fmix(long k) {
+ k ^= k >>> 33;
+ k *= 0xff51afd7ed558ccdL;
+ k ^= k >>> 33;
+ k *= 0xc4ceb9fe1a85ec53L;
+ k ^= k >>> 33;
+
+ return k;
+ }
+
+ public static long hash3_x64_128(byte[] key, int offset, int length, long seed) {
+ final int nblocks = length >> 4; // Process as 128-bit blocks.
+
+ long h1 = seed;
+ long h2 = seed;
+
+ long c1 = 0x87c37b91114253d5L;
+ long c2 = 0x4cf5ad432745937fL;
+
+ //----------
+ // body
+
+ for (int i = 0; i < nblocks; i++) {
+ long k1 = getblock(key, offset, i * 2 + 0);
+ long k2 = getblock(key, offset, i * 2 + 1);
+
+ k1 *= c1;
+ k1 = rotl64(k1, 31);
+ k1 *= c2;
+ h1 ^= k1;
+
+ h1 = rotl64(h1, 27);
+ h1 += h2;
+ h1 = h1 * 5 + 0x52dce729;
+
+ k2 *= c2;
+ k2 = rotl64(k2, 33);
+ k2 *= c1;
+ h2 ^= k2;
+
+ h2 = rotl64(h2, 31);
+ h2 += h1;
+ h2 = h2 * 5 + 0x38495ab5;
+ }
+
+ //----------
+ // tail
+
+ // Advance offset to the unprocessed tail of the data.
+ offset += nblocks * 16;
+
+ long k1 = 0;
+ long k2 = 0;
+
+ switch (length & 15) {
+ case 15:
+ k2 ^= ((long) key[offset + 14]) << 48;
+ case 14:
+ k2 ^= ((long) key[offset + 13]) << 40;
+ case 13:
+ k2 ^= ((long) key[offset + 12]) << 32;
+ case 12:
+ k2 ^= ((long) key[offset + 11]) << 24;
+ case 11:
+ k2 ^= ((long) key[offset + 10]) << 16;
+ case 10:
+ k2 ^= ((long) key[offset + 9]) << 8;
+ case 9:
+ k2 ^= ((long) key[offset + 8]) << 0;
+ k2 *= c2;
+ k2 = rotl64(k2, 33);
+ k2 *= c1;
+ h2 ^= k2;
+
+ case 8:
+ k1 ^= ((long) key[offset + 7]) << 56;
+ case 7:
+ k1 ^= ((long) key[offset + 6]) << 48;
+ case 6:
+ k1 ^= ((long) key[offset + 5]) << 40;
+ case 5:
+ k1 ^= ((long) key[offset + 4]) << 32;
+ case 4:
+ k1 ^= ((long) key[offset + 3]) << 24;
+ case 3:
+ k1 ^= ((long) key[offset + 2]) << 16;
+ case 2:
+ k1 ^= ((long) key[offset + 1]) << 8;
+ case 1:
+ k1 ^= ((long) key[offset]);
+ k1 *= c1;
+ k1 = rotl64(k1, 31);
+ k1 *= c2;
+ h1 ^= k1;
+ }
+
+ //----------
+ // finalization
+
+ h1 ^= length;
+ h2 ^= length;
+
+ h1 += h2;
+ h2 += h1;
+
+ h1 = fmix(h1);
+ h2 = fmix(h2);
+
+ h1 += h2;
+ h2 += h1;
+
+ //return (new long[]{h1, h2});
+ // SAME AS GUAVA, they take the first long out of the 128bit
+ return h1;
+ }
+
+ // END: MURMUR 3_128
+
+ // Note: We use this instead of java.util.BitSet because we need access to the long[] data field
+ static class BitArray {
+ final long[] data;
+ int bitCount;
+
+ BitArray(long bits) {
+ this(new long[Ints.checkedCast(LongMath.divide(bits, 64, RoundingMode.CEILING))]);
+ }
+
+ // Used by serialization
+ BitArray(long[] data) {
+ this.data = data;
+ int bitCount = 0;
+ for (long value : data) {
+ bitCount += Long.bitCount(value);
+ }
+ this.bitCount = bitCount;
+ }
+
+ /**
+ * Returns true if the bit changed value.
+ */
+ boolean set(int index) {
+ if (!get(index)) {
+ data[index >> 6] |= (1L << index);
+ bitCount++;
+ return true;
+ }
+ return false;
+ }
+
+ boolean get(int index) {
+ return (data[index >> 6] & (1L << index)) != 0;
+ }
+
+ /**
+ * Number of bits
+ */
+ int size() {
+ return data.length * Long.SIZE;
+ }
+
+ /**
+ * Number of set bits (1s)
+ */
+ int bitCount() {
+ return bitCount;
+ }
+
+ BitArray copy() {
+ return new BitArray(data.clone());
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (o instanceof BitArray) {
+ BitArray bitArray = (BitArray) o;
+ return Arrays.equals(data, bitArray.data);
+ }
+ return false;
+ }
+
+ @Override
+ public int hashCode() {
+ return Arrays.hashCode(data);
+ }
+ }
+}
diff --git a/src/main/java/org/elasticsearch/index/codec/postingsformat/BloomFilterLucenePostingsFormatProvider.java b/src/main/java/org/elasticsearch/index/codec/postingsformat/BloomFilterLucenePostingsFormatProvider.java
new file mode 100644
index 00000000000..180d353942e
--- /dev/null
+++ b/src/main/java/org/elasticsearch/index/codec/postingsformat/BloomFilterLucenePostingsFormatProvider.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to ElasticSearch and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. ElasticSearch licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.codec.postingsformat;
+
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.codecs.bloom.BloomFilterFactory;
+import org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat;
+import org.apache.lucene.codecs.bloom.FuzzySet;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.SegmentWriteState;
+import org.elasticsearch.common.Nullable;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.settings.IndexSettings;
+
+import java.util.Map;
+
+/**
+ */
+public class BloomFilterLucenePostingsFormatProvider extends AbstractPostingsFormatProvider {
+
+ public static final class Defaults {
+ public static final float MAX_SATURATION = 0.1f;
+ public static final float SATURATION_LIMIT = 0.9f;
+ }
+
+ private final float desiredMaxSaturation;
+ private final float saturationLimit;
+ private final PostingsFormatProvider delegate;
+ private final BloomFilteringPostingsFormat postingsFormat;
+
+ @Inject
+ public BloomFilterLucenePostingsFormatProvider(@IndexSettings Settings indexSettings, @Nullable Map postingFormatFactories, @Assisted String name, @Assisted Settings postingsFormatSettings) {
+ super(name);
+ this.desiredMaxSaturation = postingsFormatSettings.getAsFloat("desired_max_saturation", Defaults.MAX_SATURATION);
+ this.saturationLimit = postingsFormatSettings.getAsFloat("saturation_limit", Defaults.SATURATION_LIMIT);
+ this.delegate = Helper.lookup(indexSettings, postingsFormatSettings.get("delegate"), postingFormatFactories);
+ this.postingsFormat = new BloomFilteringPostingsFormat(
+ delegate.get(),
+ new CustomBloomFilterFactory(desiredMaxSaturation, saturationLimit)
+ );
+ }
+
+ public float desiredMaxSaturation() {
+ return desiredMaxSaturation;
+ }
+
+ public float saturationLimit() {
+ return saturationLimit;
+ }
+
+ public PostingsFormatProvider delegate() {
+ return delegate;
+ }
+
+ @Override
+ public PostingsFormat get() {
+ return postingsFormat;
+ }
+
+ public static class CustomBloomFilterFactory extends BloomFilterFactory {
+
+ private final float desiredMaxSaturation;
+ private final float saturationLimit;
+
+ public CustomBloomFilterFactory() {
+ this(Defaults.MAX_SATURATION, Defaults.SATURATION_LIMIT);
+ }
+
+ CustomBloomFilterFactory(float desiredMaxSaturation, float saturationLimit) {
+ this.desiredMaxSaturation = desiredMaxSaturation;
+ this.saturationLimit = saturationLimit;
+ }
+
+ @Override
+ public FuzzySet getSetForField(SegmentWriteState state, FieldInfo info) {
+ //Assume all of the docs have a unique term (e.g. a primary key) and we hope to maintain a set with desiredMaxSaturation% of bits set
+ return FuzzySet.createSetBasedOnQuality(state.segmentInfo.getDocCount(), desiredMaxSaturation);
+ }
+
+ @Override
+ public boolean isSaturated(FuzzySet bloomFilter, FieldInfo fieldInfo) {
+ // Don't bother saving bitsets if > saturationLimit % of bits are set - we don't want to
+ // throw any more memory at this problem.
+ return bloomFilter.getSaturation() > saturationLimit;
+ }
+ }
+}
diff --git a/src/main/java/org/elasticsearch/index/codec/postingsformat/BloomFilterPostingsFormat.java b/src/main/java/org/elasticsearch/index/codec/postingsformat/BloomFilterPostingsFormat.java
new file mode 100644
index 00000000000..5bac2327bb9
--- /dev/null
+++ b/src/main/java/org/elasticsearch/index/codec/postingsformat/BloomFilterPostingsFormat.java
@@ -0,0 +1,452 @@
+/*
+ * Licensed to ElasticSearch and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. ElasticSearch licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.codec.postingsformat;
+
+import org.apache.lucene.codecs.*;
+import org.apache.lucene.index.*;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
+
+import java.io.IOException;
+import java.util.*;
+import java.util.Map.Entry;
+
+/**
+ *
+ * A {@link PostingsFormat} useful for low doc-frequency fields such as primary
+ * keys. Bloom filters are maintained in a ".blm" file which offers "fast-fail"
+ * for reads in segments known to have no record of the key. A choice of
+ * delegate PostingsFormat is used to record all other Postings data.
+ *
+ *
+ * This is a special bloom filter version, based on {@link BloomFilter} and inspired
+ * by Lucene {@link org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat}.
+ *
+ */
+public final class BloomFilterPostingsFormat extends PostingsFormat {
+
+ public static final String BLOOM_CODEC_NAME = "XBloomFilter"; // the Lucene one is named BloomFilter
+ public static final int BLOOM_CODEC_VERSION = 1;
+
+ /**
+ * Extension of Bloom Filters file
+ */
+ static final String BLOOM_EXTENSION = "blm";
+
+ private BloomFilter.Factory bloomFilterFactory = BloomFilter.Factory.DEFAULT;
+ private PostingsFormat delegatePostingsFormat;
+
+ /**
+ * Creates Bloom filters for a selection of fields created in the index. This
+ * is recorded as a set of Bitsets held as a segment summary in an additional
+ * "blm" file. This PostingsFormat delegates to a choice of delegate
+ * PostingsFormat for encoding all other postings data.
+ *
+ * @param delegatePostingsFormat The PostingsFormat that records all the non-bloom filter data i.e.
+ * postings info.
+ * @param bloomFilterFactory The {@link BloomFilter.Factory} responsible for sizing BloomFilters
+ * appropriately
+ */
+ public BloomFilterPostingsFormat(PostingsFormat delegatePostingsFormat,
+ BloomFilter.Factory bloomFilterFactory) {
+ super(BLOOM_CODEC_NAME);
+ this.delegatePostingsFormat = delegatePostingsFormat;
+ this.bloomFilterFactory = bloomFilterFactory;
+ }
+
+ // Used only by core Lucene at read-time via Service Provider instantiation -
+ // do not use at Write-time in application code.
+ public BloomFilterPostingsFormat() {
+ super(BLOOM_CODEC_NAME);
+ }
+
+ @Override
+ public FieldsConsumer fieldsConsumer(SegmentWriteState state)
+ throws IOException {
+ if (delegatePostingsFormat == null) {
+ throw new UnsupportedOperationException("Error - " + getClass().getName()
+ + " has been constructed without a choice of PostingsFormat");
+ }
+ return new BloomFilteredFieldsConsumer(
+ delegatePostingsFormat.fieldsConsumer(state), state,
+ delegatePostingsFormat);
+ }
+
+ @Override
+ public FieldsProducer fieldsProducer(SegmentReadState state)
+ throws IOException {
+ return new BloomFilteredFieldsProducer(state);
+ }
+
+ public class BloomFilteredFieldsProducer extends FieldsProducer {
+ private FieldsProducer delegateFieldsProducer;
+ HashMap bloomsByFieldName = new HashMap();
+
+ public BloomFilteredFieldsProducer(SegmentReadState state)
+ throws IOException {
+
+ String bloomFileName = IndexFileNames.segmentFileName(
+ state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION);
+ IndexInput bloomIn = null;
+ boolean success = false;
+ try {
+ bloomIn = state.dir.openInput(bloomFileName, state.context);
+ CodecUtil.checkHeader(bloomIn, BLOOM_CODEC_NAME, BLOOM_CODEC_VERSION,
+ BLOOM_CODEC_VERSION);
+ // // Load the hash function used in the BloomFilter
+ // hashFunction = HashFunction.forName(bloomIn.readString());
+ // Load the delegate postings format
+ PostingsFormat delegatePostingsFormat = PostingsFormat.forName(bloomIn
+ .readString());
+
+ this.delegateFieldsProducer = delegatePostingsFormat
+ .fieldsProducer(state);
+ int numBlooms = bloomIn.readInt();
+ for (int i = 0; i < numBlooms; i++) {
+ int fieldNum = bloomIn.readInt();
+ BloomFilter bloom = BloomFilter.deserialize(bloomIn);
+ FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNum);
+ bloomsByFieldName.put(fieldInfo.name, bloom);
+ }
+ IOUtils.close(bloomIn);
+ success = true;
+ } finally {
+ if (!success) {
+ IOUtils.closeWhileHandlingException(bloomIn, delegateFieldsProducer);
+ }
+ }
+ }
+
+ @Override
+ public Iterator iterator() {
+ return delegateFieldsProducer.iterator();
+ }
+
+ @Override
+ public void close() throws IOException {
+ delegateFieldsProducer.close();
+ }
+
+ @Override
+ public Terms terms(String field) throws IOException {
+ BloomFilter filter = bloomsByFieldName.get(field);
+ if (filter == null) {
+ return delegateFieldsProducer.terms(field);
+ } else {
+ Terms result = delegateFieldsProducer.terms(field);
+ if (result == null) {
+ return null;
+ }
+ return new BloomFilteredTerms(result, filter);
+ }
+ }
+
+ @Override
+ public int size() {
+ return delegateFieldsProducer.size();
+ }
+
+ public long getUniqueTermCount() throws IOException {
+ return delegateFieldsProducer.getUniqueTermCount();
+ }
+
+ public class BloomFilteredTerms extends Terms {
+ private Terms delegateTerms;
+ private BloomFilter filter;
+
+ public BloomFilteredTerms(Terms terms, BloomFilter filter) {
+ this.delegateTerms = terms;
+ this.filter = filter;
+ }
+
+ public BloomFilter getFilter() {
+ return filter;
+ }
+
+ @Override
+ public TermsEnum intersect(CompiledAutomaton compiled,
+ final BytesRef startTerm) throws IOException {
+ return delegateTerms.intersect(compiled, startTerm);
+ }
+
+ @Override
+ public TermsEnum iterator(TermsEnum reuse) throws IOException {
+ TermsEnum result;
+ if ((reuse != null) && (reuse instanceof BloomFilteredTermsEnum)) {
+ // recycle the existing BloomFilteredTermsEnum by asking the delegate
+ // to recycle its contained TermsEnum
+ BloomFilteredTermsEnum bfte = (BloomFilteredTermsEnum) reuse;
+ if (bfte.filter == filter) {
+ bfte.delegateTermsEnum = delegateTerms.iterator(bfte.delegateTermsEnum);
+ return bfte;
+ }
+ }
+ // We have been handed something we cannot reuse (either null, wrong
+ // class or wrong filter) so allocate a new object
+ result = new BloomFilteredTermsEnum(delegateTerms.iterator(reuse), filter);
+ return result;
+ }
+
+ @Override
+ public Comparator getComparator() throws IOException {
+ return delegateTerms.getComparator();
+ }
+
+ @Override
+ public long size() throws IOException {
+ return delegateTerms.size();
+ }
+
+ @Override
+ public long getSumTotalTermFreq() throws IOException {
+ return delegateTerms.getSumTotalTermFreq();
+ }
+
+ @Override
+ public long getSumDocFreq() throws IOException {
+ return delegateTerms.getSumDocFreq();
+ }
+
+ @Override
+ public int getDocCount() throws IOException {
+ return delegateTerms.getDocCount();
+ }
+
+ @Override
+ public boolean hasOffsets() {
+ return delegateTerms.hasOffsets();
+ }
+
+ @Override
+ public boolean hasPositions() {
+ return delegateTerms.hasPositions();
+ }
+
+ @Override
+ public boolean hasPayloads() {
+ return delegateTerms.hasPayloads();
+ }
+ }
+
+ class BloomFilteredTermsEnum extends TermsEnum {
+
+ TermsEnum delegateTermsEnum;
+ private BloomFilter filter;
+
+ public BloomFilteredTermsEnum(TermsEnum iterator, BloomFilter filter) {
+ this.delegateTermsEnum = iterator;
+ this.filter = filter;
+ }
+
+ @Override
+ public final BytesRef next() throws IOException {
+ return delegateTermsEnum.next();
+ }
+
+ @Override
+ public final Comparator getComparator() {
+ return delegateTermsEnum.getComparator();
+ }
+
+ @Override
+ public final boolean seekExact(BytesRef text, boolean useCache)
+ throws IOException {
+ // The magical fail-fast speed up that is the entire point of all of
+ // this code - save a disk seek if there is a match on an in-memory
+ // structure
+ // that may occasionally give a false positive but guaranteed no false
+ // negatives
+ if (!filter.mightContain(text)) {
+ return false;
+ }
+ return delegateTermsEnum.seekExact(text, useCache);
+ }
+
+ @Override
+ public final SeekStatus seekCeil(BytesRef text, boolean useCache)
+ throws IOException {
+ return delegateTermsEnum.seekCeil(text, useCache);
+ }
+
+ @Override
+ public final void seekExact(long ord) throws IOException {
+ delegateTermsEnum.seekExact(ord);
+ }
+
+ @Override
+ public final BytesRef term() throws IOException {
+ return delegateTermsEnum.term();
+ }
+
+ @Override
+ public final long ord() throws IOException {
+ return delegateTermsEnum.ord();
+ }
+
+ @Override
+ public final int docFreq() throws IOException {
+ return delegateTermsEnum.docFreq();
+ }
+
+ @Override
+ public final long totalTermFreq() throws IOException {
+ return delegateTermsEnum.totalTermFreq();
+ }
+
+
+ @Override
+ public DocsAndPositionsEnum docsAndPositions(Bits liveDocs,
+ DocsAndPositionsEnum reuse, int flags) throws IOException {
+ return delegateTermsEnum.docsAndPositions(liveDocs, reuse, flags);
+ }
+
+ @Override
+ public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags)
+ throws IOException {
+ return delegateTermsEnum.docs(liveDocs, reuse, flags);
+ }
+
+
+ }
+
+ }
+
+ class BloomFilteredFieldsConsumer extends FieldsConsumer {
+ private FieldsConsumer delegateFieldsConsumer;
+ private Map bloomFilters = new HashMap();
+ private SegmentWriteState state;
+
+ // private PostingsFormat delegatePostingsFormat;
+
+ public BloomFilteredFieldsConsumer(FieldsConsumer fieldsConsumer,
+ SegmentWriteState state, PostingsFormat delegatePostingsFormat) {
+ this.delegateFieldsConsumer = fieldsConsumer;
+ // this.delegatePostingsFormat=delegatePostingsFormat;
+ this.state = state;
+ }
+
+ @Override
+ public TermsConsumer addField(FieldInfo field) throws IOException {
+ BloomFilter bloomFilter = bloomFilterFactory.createFilter(state.segmentInfo.getDocCount());
+ if (bloomFilter != null) {
+ assert bloomFilters.containsKey(field) == false;
+ bloomFilters.put(field, bloomFilter);
+ return new WrappedTermsConsumer(delegateFieldsConsumer.addField(field), bloomFilter);
+ } else {
+ // No, use the unfiltered fieldsConsumer - we are not interested in
+ // recording any term Bitsets.
+ return delegateFieldsConsumer.addField(field);
+ }
+ }
+
+ @Override
+ public void close() throws IOException {
+ delegateFieldsConsumer.close();
+ // Now we are done accumulating values for these fields
+ List> nonSaturatedBlooms = new ArrayList>();
+
+ for (Entry entry : bloomFilters.entrySet()) {
+ BloomFilter bloomFilter = entry.getValue();
+ //if (!bloomFilterFactory.isSaturated(bloomFilter, entry.getKey())) {
+ nonSaturatedBlooms.add(entry);
+ //}
+ }
+ String bloomFileName = IndexFileNames.segmentFileName(
+ state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION);
+ IndexOutput bloomOutput = null;
+ try {
+ bloomOutput = state.directory
+ .createOutput(bloomFileName, state.context);
+ CodecUtil.writeHeader(bloomOutput, BLOOM_CODEC_NAME,
+ BLOOM_CODEC_VERSION);
+ // remember the name of the postings format we will delegate to
+ bloomOutput.writeString(delegatePostingsFormat.getName());
+
+ // First field in the output file is the number of fields+blooms saved
+ bloomOutput.writeInt(nonSaturatedBlooms.size());
+ for (Entry entry : nonSaturatedBlooms) {
+ FieldInfo fieldInfo = entry.getKey();
+ BloomFilter bloomFilter = entry.getValue();
+ bloomOutput.writeInt(fieldInfo.number);
+ saveAppropriatelySizedBloomFilter(bloomOutput, bloomFilter, fieldInfo);
+ }
+ } finally {
+ IOUtils.close(bloomOutput);
+ }
+ //We are done with large bitsets so no need to keep them hanging around
+ bloomFilters.clear();
+ }
+
+ private void saveAppropriatelySizedBloomFilter(IndexOutput bloomOutput,
+ BloomFilter bloomFilter, FieldInfo fieldInfo) throws IOException {
+
+// FuzzySet rightSizedSet = bloomFilterFactory.downsize(fieldInfo,
+// bloomFilter);
+// if (rightSizedSet == null) {
+// rightSizedSet = bloomFilter;
+// }
+// rightSizedSet.serialize(bloomOutput);
+ BloomFilter.serilaize(bloomFilter, bloomOutput);
+ }
+
+ }
+
+ class WrappedTermsConsumer extends TermsConsumer {
+ private TermsConsumer delegateTermsConsumer;
+ private BloomFilter bloomFilter;
+
+ public WrappedTermsConsumer(TermsConsumer termsConsumer, BloomFilter bloomFilter) {
+ this.delegateTermsConsumer = termsConsumer;
+ this.bloomFilter = bloomFilter;
+ }
+
+ @Override
+ public PostingsConsumer startTerm(BytesRef text) throws IOException {
+ return delegateTermsConsumer.startTerm(text);
+ }
+
+ @Override
+ public void finishTerm(BytesRef text, TermStats stats) throws IOException {
+
+ // Record this term in our BloomFilter
+ if (stats.docFreq > 0) {
+ bloomFilter.put(text);
+ }
+ delegateTermsConsumer.finishTerm(text, stats);
+ }
+
+ @Override
+ public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount)
+ throws IOException {
+ delegateTermsConsumer.finish(sumTotalTermFreq, sumDocFreq, docCount);
+ }
+
+ @Override
+ public Comparator getComparator() throws IOException {
+ return delegateTermsConsumer.getComparator();
+ }
+
+ }
+
+}
diff --git a/src/main/java/org/elasticsearch/index/codec/postingsformat/BloomFilterPostingsFormatProvider.java b/src/main/java/org/elasticsearch/index/codec/postingsformat/BloomFilterPostingsFormatProvider.java
index 3ddf2514a7e..d4c7f259329 100644
--- a/src/main/java/org/elasticsearch/index/codec/postingsformat/BloomFilterPostingsFormatProvider.java
+++ b/src/main/java/org/elasticsearch/index/codec/postingsformat/BloomFilterPostingsFormatProvider.java
@@ -20,11 +20,6 @@
package org.elasticsearch.index.codec.postingsformat;
import org.apache.lucene.codecs.PostingsFormat;
-import org.apache.lucene.codecs.bloom.BloomFilterFactory;
-import org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat;
-import org.apache.lucene.codecs.bloom.FuzzySet;
-import org.apache.lucene.index.FieldInfo;
-import org.apache.lucene.index.SegmentWriteState;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
@@ -37,36 +32,19 @@ import java.util.Map;
*/
public class BloomFilterPostingsFormatProvider extends AbstractPostingsFormatProvider {
- public static final class Defaults {
- public static final float MAX_SATURATION = 0.1f;
- public static final float SATURATION_LIMIT = 0.9f;
- }
-
- private final float desiredMaxSaturation;
- private final float saturationLimit;
private final PostingsFormatProvider delegate;
- private final BloomFilteringPostingsFormat postingsFormat;
+ private final BloomFilterPostingsFormat postingsFormat;
@Inject
public BloomFilterPostingsFormatProvider(@IndexSettings Settings indexSettings, @Nullable Map postingFormatFactories, @Assisted String name, @Assisted Settings postingsFormatSettings) {
super(name);
- this.desiredMaxSaturation = postingsFormatSettings.getAsFloat("desired_max_saturation", Defaults.MAX_SATURATION);
- this.saturationLimit = postingsFormatSettings.getAsFloat("saturation_limit", Defaults.SATURATION_LIMIT);
this.delegate = Helper.lookup(indexSettings, postingsFormatSettings.get("delegate"), postingFormatFactories);
- this.postingsFormat = new BloomFilteringPostingsFormat(
+ this.postingsFormat = new BloomFilterPostingsFormat(
delegate.get(),
- new CustomBloomFilterFactory(desiredMaxSaturation, saturationLimit)
+ BloomFilter.Factory.buildFromString(indexSettings.get("fpp"))
);
}
- public float desiredMaxSaturation() {
- return desiredMaxSaturation;
- }
-
- public float saturationLimit() {
- return saturationLimit;
- }
-
public PostingsFormatProvider delegate() {
return delegate;
}
@@ -75,32 +53,4 @@ public class BloomFilterPostingsFormatProvider extends AbstractPostingsFormatPro
public PostingsFormat get() {
return postingsFormat;
}
-
- public static class CustomBloomFilterFactory extends BloomFilterFactory {
-
- private final float desiredMaxSaturation;
- private final float saturationLimit;
-
- public CustomBloomFilterFactory() {
- this(Defaults.MAX_SATURATION, Defaults.SATURATION_LIMIT);
- }
-
- CustomBloomFilterFactory(float desiredMaxSaturation, float saturationLimit) {
- this.desiredMaxSaturation = desiredMaxSaturation;
- this.saturationLimit = saturationLimit;
- }
-
- @Override
- public FuzzySet getSetForField(SegmentWriteState state, FieldInfo info) {
- //Assume all of the docs have a unique term (e.g. a primary key) and we hope to maintain a set with desiredMaxSaturation% of bits set
- return FuzzySet.createSetBasedOnQuality(state.segmentInfo.getDocCount(), desiredMaxSaturation);
- }
-
- @Override
- public boolean isSaturated(FuzzySet bloomFilter, FieldInfo fieldInfo) {
- // Don't bother saving bitsets if > saturationLimit % of bits are set - we don't want to
- // throw any more memory at this problem.
- return bloomFilter.getSaturation() > saturationLimit;
- }
- }
}
diff --git a/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingFormats.java b/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingFormats.java
index e9410f33d9c..957f74112a1 100644
--- a/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingFormats.java
+++ b/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingFormats.java
@@ -37,23 +37,23 @@ import org.elasticsearch.common.collect.MapBuilder;
* its terms and postings directly into memory. Note this postings format is
* very memory intensive and has certain limitation that don't allow segments to
* grow beyond 2.1GB see {@link DirectPostingsFormat} for details.
- *
+ *
* memory: a postings format that stores its entire terms, postings,
* positions and payloads in a finite state transducer. This format should only
* be used for primary keys or with fields where each term is contained in a
* very low number of documents.
- *
+ *
* pulsing: a postings format in-lines the posting lists for very low
* frequent terms in the term dictionary. This is useful to improve lookup
* performance for low-frequent terms.
- *
+ *
* bloom_default: a postings format that uses a bloom filter to
* improve term lookup performance. This is useful for primarily keys or fields
* that are used as a delete key
- *
+ *
* bloom_pulsing: a postings format that combines the advantages of
* bloom and pulsing to further improve lookup performance
- *
+ *
* default: the default Elasticsearch postings format offering best
* general purpose performance. This format is used if no postings format is
* specified in the field mapping.
@@ -73,13 +73,23 @@ public class PostingFormats {
buildInPostingFormatsX.put("memory", new PreBuiltPostingsFormatProvider.Factory("memory", new MemoryPostingsFormat()));
// LUCENE UPGRADE: Need to change this to the relevant ones on a lucene upgrade
buildInPostingFormatsX.put("pulsing", new PreBuiltPostingsFormatProvider.Factory("pulsing", new Pulsing41PostingsFormat()));
- buildInPostingFormatsX.put("bloom_pulsing", new PreBuiltPostingsFormatProvider.Factory("bloom_pulsing", new BloomFilteringPostingsFormat(new Pulsing41PostingsFormat(), new BloomFilterPostingsFormatProvider.CustomBloomFilterFactory())));
buildInPostingFormatsX.put("default", new PreBuiltPostingsFormatProvider.Factory("default", new Lucene41PostingsFormat()));
- buildInPostingFormatsX.put("bloom_default", new PreBuiltPostingsFormatProvider.Factory("bloom_default", new BloomFilteringPostingsFormat(new Lucene41PostingsFormat(), new BloomFilterPostingsFormatProvider.CustomBloomFilterFactory())));
+
+ buildInPostingFormatsX.put("bloom_pulsing", new PreBuiltPostingsFormatProvider.Factory("bloom_pulsing", wrapInBloom(new Pulsing41PostingsFormat())));
+ buildInPostingFormatsX.put("bloom_default", new PreBuiltPostingsFormatProvider.Factory("bloom_default", wrapInBloom(new Lucene41PostingsFormat())));
builtInPostingFormats = buildInPostingFormatsX.immutableMap();
}
+ public static final boolean luceneBloomFilter = false;
+
+ static PostingsFormat wrapInBloom(PostingsFormat delegate) {
+ if (luceneBloomFilter) {
+ return new BloomFilteringPostingsFormat(delegate, new BloomFilterLucenePostingsFormatProvider.CustomBloomFilterFactory());
+ }
+ return new BloomFilterPostingsFormat(delegate, BloomFilter.Factory.DEFAULT);
+ }
+
public static PostingsFormatProvider.Factory getAsFactory(String name) {
return builtInPostingFormats.get(name);
}
diff --git a/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingsFormatProvider.java b/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingsFormatProvider.java
index d46e8976685..7e94b5f8a40 100644
--- a/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingsFormatProvider.java
+++ b/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingsFormatProvider.java
@@ -50,12 +50,12 @@ import java.util.Map;
* postings format is exposed via
* index.codec.postings_format.elastic_fantastic.type : "ElasticFantastic".
*
- *
+ *
* @see CodecModule
*/
public interface PostingsFormatProvider {
public static final String POSTINGS_FORMAT_SETTINGS_PREFIX = "index.codec.postings_format";
-
+
/**
* A helper class to lookup {@link PostingsFormatProvider providers} by their unique {@link PostingsFormatProvider#name() name}
*/
@@ -64,16 +64,17 @@ public interface PostingsFormatProvider {
/**
* Looks up and creates {@link PostingsFormatProvider} for the given name.
*
- * The settings for the created {@link PostingsFormatProvider} is taken from the given index settings.
+ * The settings for the created {@link PostingsFormatProvider} is taken from the given index settings.
* All settings with the {@value PostingsFormatProvider#POSTINGS_FORMAT_SETTINGS_PREFIX} prefix
* and the formats name as the key are passed to the factory.
- *
- *
- * @param indexSettings the index settings to configure the postings format
- * @param name the name of the postings format to lookup
+ *
+ *
+ * @param indexSettings the index settings to configure the postings format
+ * @param name the name of the postings format to lookup
* @param postingFormatFactories the factory mapping to lookup the {@link Factory} to create the {@link PostingsFormatProvider}
* @return a fully configured {@link PostingsFormatProvider} for the given name.
- * @throws ElasticSearchIllegalArgumentException if the no {@link PostingsFormatProvider} for the given name parameter could be found.
+ * @throws ElasticSearchIllegalArgumentException
+ * if the no {@link PostingsFormatProvider} for the given name parameter could be found.
*/
public static PostingsFormatProvider lookup(@IndexSettings Settings indexSettings, String name, Map postingFormatFactories) throws ElasticSearchIllegalArgumentException {
Factory factory = postingFormatFactories.get(name);
@@ -100,7 +101,7 @@ public interface PostingsFormatProvider {
/**
* A simple factory used to create {@link PostingsFormatProvider} used by
- * delegating providers like {@link BloomFilterPostingsFormatProvider} or
+ * delegating providers like {@link BloomFilterLucenePostingsFormatProvider} or
* {@link PulsingPostingsFormatProvider}. Those providers wrap other
* postings formats to enrich their capabilities.
*/
diff --git a/src/main/resources/META-INF/services/org.apache.lucene.codecs.Codec b/src/main/resources/META-INF/services/org.apache.lucene.codecs.Codec
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/src/main/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat b/src/main/resources/META-INF/services/org.apache.lucene.codecs.DocValuesFormat
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/src/main/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat b/src/main/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat
new file mode 100644
index 00000000000..3147f905bf8
--- /dev/null
+++ b/src/main/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat
@@ -0,0 +1 @@
+org.elasticsearch.index.codec.postingsformat.BloomFilterPostingsFormat
diff --git a/src/test/java/org/elasticsearch/benchmark/bloom/BloomBench.java b/src/test/java/org/elasticsearch/benchmark/bloom/BloomBench.java
new file mode 100644
index 00000000000..e9bb2e8551d
--- /dev/null
+++ b/src/test/java/org/elasticsearch/benchmark/bloom/BloomBench.java
@@ -0,0 +1,42 @@
+package org.elasticsearch.benchmark.bloom;
+
+import org.apache.lucene.codecs.bloom.FuzzySet;
+import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.common.UUID;
+import org.elasticsearch.common.unit.ByteSizeValue;
+import org.elasticsearch.common.unit.SizeValue;
+import org.elasticsearch.index.codec.postingsformat.BloomFilter;
+
+/**
+ */
+public class BloomBench {
+
+ public static void main(String[] args) throws Exception {
+ final int ELEMENTS = (int) SizeValue.parseSizeValue("1m").singles();
+ final double fpp = 0.01;
+ BloomFilter gFilter = BloomFilter.create(ELEMENTS, fpp);
+ System.out.println("G SIZE: " + new ByteSizeValue(gFilter.getSizeInBytes()));
+
+ FuzzySet lFilter = FuzzySet.createSetBasedOnMaxMemory((int) gFilter.getSizeInBytes());
+ //FuzzySet lFilter = FuzzySet.createSetBasedOnQuality(ELEMENTS, 0.97f);
+
+ for (int i = 0; i < ELEMENTS; i++) {
+ BytesRef bytesRef = new BytesRef(UUID.randomBase64UUID());
+ gFilter.put(bytesRef);
+ lFilter.addValue(bytesRef);
+ }
+
+ int lFalse = 0;
+ int gFalse = 0;
+ for (int i = 0; i < ELEMENTS; i++) {
+ BytesRef bytesRef = new BytesRef(UUID.randomBase64UUID());
+ if (gFilter.mightContain(bytesRef)) {
+ gFalse++;
+ }
+ if (lFilter.contains(bytesRef) == FuzzySet.ContainsResult.MAYBE) {
+ lFalse++;
+ }
+ }
+ System.out.println("Failed positives, g[" + gFalse + "], l[" + lFalse + "]");
+ }
+}
diff --git a/src/test/java/org/elasticsearch/test/unit/index/codec/CodecTests.java b/src/test/java/org/elasticsearch/test/unit/index/codec/CodecTests.java
index 07983cdccc7..133f6b7b8bd 100644
--- a/src/test/java/org/elasticsearch/test/unit/index/codec/CodecTests.java
+++ b/src/test/java/org/elasticsearch/test/unit/index/codec/CodecTests.java
@@ -22,7 +22,6 @@ package org.elasticsearch.test.unit.index.codec;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat;
import org.apache.lucene.codecs.lucene40.Lucene40Codec;
-import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
import org.apache.lucene.codecs.lucene41.Lucene41Codec;
import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat;
import org.apache.lucene.codecs.memory.DirectPostingsFormat;
@@ -77,12 +76,22 @@ public class CodecTests {
assertThat(postingsFormatService.get("Lucene41").get(), instanceOf(((PerFieldPostingsFormat) Codec.getDefault().postingsFormat()).getPostingsFormatForField(null).getClass()));
assertThat(postingsFormatService.get("bloom_default"), instanceOf(PreBuiltPostingsFormatProvider.class));
- assertThat(postingsFormatService.get("bloom_default").get(), instanceOf(BloomFilteringPostingsFormat.class));
+ if (PostingFormats.luceneBloomFilter) {
+ assertThat(postingsFormatService.get("bloom_default").get(), instanceOf(BloomFilteringPostingsFormat.class));
+ } else {
+ assertThat(postingsFormatService.get("bloom_default").get(), instanceOf(BloomFilterPostingsFormat.class));
+ }
assertThat(postingsFormatService.get("BloomFilter"), instanceOf(PreBuiltPostingsFormatProvider.class));
assertThat(postingsFormatService.get("BloomFilter").get(), instanceOf(BloomFilteringPostingsFormat.class));
- assertThat(postingsFormatService.get("bloom_pulsing"), instanceOf(PreBuiltPostingsFormatProvider.class));
- assertThat(postingsFormatService.get("bloom_pulsing").get(), instanceOf(BloomFilteringPostingsFormat.class));
+ assertThat(postingsFormatService.get("XBloomFilter"), instanceOf(PreBuiltPostingsFormatProvider.class));
+ assertThat(postingsFormatService.get("XBloomFilter").get(), instanceOf(BloomFilterPostingsFormat.class));
+
+ if (PostingFormats.luceneBloomFilter) {
+ assertThat(postingsFormatService.get("bloom_pulsing").get(), instanceOf(BloomFilteringPostingsFormat.class));
+ } else {
+ assertThat(postingsFormatService.get("bloom_pulsing").get(), instanceOf(BloomFilterPostingsFormat.class));
+ }
assertThat(postingsFormatService.get("pulsing"), instanceOf(PreBuiltPostingsFormatProvider.class));
assertThat(postingsFormatService.get("pulsing").get(), instanceOf(Pulsing41PostingsFormat.class));
@@ -203,7 +212,7 @@ public class CodecTests {
}
@Test
- public void testResolvePostingFormatsFromMapping_bloom() throws Exception {
+ public void testResolvePostingFormatsFromMappingLuceneBloom() throws Exception {
String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
.startObject("properties")
.startObject("field1").field("type", "string").field("postings_format", "bloom_default").endObject()
@@ -213,7 +222,7 @@ public class CodecTests {
.endObject().endObject().string();
Settings indexSettings = ImmutableSettings.settingsBuilder()
- .put("index.codec.postings_format.my_format1.type", "bloom_filter")
+ .put("index.codec.postings_format.my_format1.type", "bloom_filter_lucene")
.put("index.codec.postings_format.my_format1.desired_max_saturation", 0.2f)
.put("index.codec.postings_format.my_format1.saturation_limit", 0.8f)
.put("index.codec.postings_format.my_format1.delegate", "delegate1")
@@ -224,13 +233,21 @@ public class CodecTests {
CodecService codecService = createCodecService(indexSettings);
DocumentMapper documentMapper = codecService.mapperService().documentMapperParser().parse(mapping);
assertThat(documentMapper.mappers().name("field1").mapper().postingsFormatProvider(), instanceOf(PreBuiltPostingsFormatProvider.class));
- assertThat(documentMapper.mappers().name("field1").mapper().postingsFormatProvider().get(), instanceOf(BloomFilteringPostingsFormat.class));
+ if (PostingFormats.luceneBloomFilter) {
+ assertThat(documentMapper.mappers().name("field1").mapper().postingsFormatProvider().get(), instanceOf(BloomFilteringPostingsFormat.class));
+ } else {
+ assertThat(documentMapper.mappers().name("field1").mapper().postingsFormatProvider().get(), instanceOf(BloomFilterPostingsFormat.class));
+ }
assertThat(documentMapper.mappers().name("field2").mapper().postingsFormatProvider(), instanceOf(PreBuiltPostingsFormatProvider.class));
- assertThat(documentMapper.mappers().name("field2").mapper().postingsFormatProvider().get(), instanceOf(BloomFilteringPostingsFormat.class));
+ if (PostingFormats.luceneBloomFilter) {
+ assertThat(documentMapper.mappers().name("field2").mapper().postingsFormatProvider().get(), instanceOf(BloomFilteringPostingsFormat.class));
+ } else {
+ assertThat(documentMapper.mappers().name("field2").mapper().postingsFormatProvider().get(), instanceOf(BloomFilterPostingsFormat.class));
+ }
- assertThat(documentMapper.mappers().name("field3").mapper().postingsFormatProvider(), instanceOf(BloomFilterPostingsFormatProvider.class));
- BloomFilterPostingsFormatProvider provider = (BloomFilterPostingsFormatProvider) documentMapper.mappers().name("field3").mapper().postingsFormatProvider();
+ assertThat(documentMapper.mappers().name("field3").mapper().postingsFormatProvider(), instanceOf(BloomFilterLucenePostingsFormatProvider.class));
+ BloomFilterLucenePostingsFormatProvider provider = (BloomFilterLucenePostingsFormatProvider) documentMapper.mappers().name("field3").mapper().postingsFormatProvider();
assertThat(provider.desiredMaxSaturation(), equalTo(0.2f));
assertThat(provider.saturationLimit(), equalTo(0.8f));
assertThat(provider.delegate(), instanceOf(DirectPostingsFormatProvider.class));